diff --git a/.github/codeql/codeql-config.yml b/.github/codeql/codeql-config.yml new file mode 100644 index 0000000000..125d2be5b1 --- /dev/null +++ b/.github/codeql/codeql-config.yml @@ -0,0 +1,2 @@ +paths-ignore: + - '**/_deps/**' \ No newline at end of file diff --git a/.github/docker/README.md b/.github/docker/README.md index 782dce372e..81adbc8f1a 100644 --- a/.github/docker/README.md +++ b/.github/docker/README.md @@ -8,10 +8,10 @@ development environment. # How to build docker image -To build docker image on local machine execute: +To build docker image on local machine, enter the root dir of the repository and execute: ```sh -docker build -t ur:ubuntu-22.04 -f ./ubuntu-22.04.Dockerfile . +docker build -t ur:ubuntu-22.04 -f .github/docker/ubuntu-22.04.Dockerfile . ``` To set any build time variable (e.g., an optional ARG from docker recipe), add to the command (after `build`), e.g.: diff --git a/.github/docker/fedora-40.Dockerfile b/.github/docker/fedora-40.Dockerfile new file mode 100644 index 0000000000..70f77345fa --- /dev/null +++ b/.github/docker/fedora-40.Dockerfile @@ -0,0 +1,82 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of fedora-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("40") +FROM registry.hub.docker.com/library/fedora@sha256:5ce8497aeea599bf6b54ab3979133923d82aaa4f6ca5ced1812611b197c79eb0 + +# Set environment variables +ENV OS fedora +ENV OS_VER 40 +ENV NOTTY 1 + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). 
+ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + cmake \ + git \ + make" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-pip" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + ncurses-libs-6.4 \ + passwd \ + sudo \ + wget" + +# Update and install required packages +RUN dnf update -y \ + && dnf install -y \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${MISC_DEPS} \ + && dnf clean all + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. dependencies (installed via pip) +# +# It's actively used and tested only on selected distros. Be aware +# they may not work, because pip packages list differ from OS to OS. +COPY third_party/requirements.txt /opt/ur/requirements.txt + +# Install DPC++ +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh +ENV DPCPP_PATH=/opt/dpcpp +RUN /opt/ur/install_dpcpp.sh + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' +ENV USER test_user +ENV USERPASS pass +# Change shell to bash with safe pipe usage +SHELL [ "/bin/bash", "-o", "pipefail", "-c" ] +RUN useradd -m ${USER} \ + && echo "${USER}:${USERPASS}" | chpasswd \ + && gpasswd wheel -a ${USER} + +# Change shell back to default and switch to 'test_user' +SHELL ["/bin/sh", "-c"] +USER test_user diff --git a/.github/docker/install_dpcpp.sh b/.github/docker/install_dpcpp.sh index 0aac93eee4..87548a5b64 100755 --- a/.github/docker/install_dpcpp.sh +++ b/.github/docker/install_dpcpp.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
# See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -15,9 +15,6 @@ if [ "${SKIP_DPCPP_BUILD}" ]; then exit fi -apt-get install -y --no-install-recommends \ - libncurses5 - -mkdir -p ${DPCPP_PATH} -wget -O ${DPCPP_PATH}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/sycl-nightly%2F20230626/dpcpp-compiler.tar.gz -tar -xvf ${DPCPP_PATH}/dpcpp_compiler.tar.gz -C ${DPCPP_PATH}/ +mkdir -p ${DPCPP_PATH}/dpcpp_compiler +wget -O ${DPCPP_PATH}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-09-27/sycl_linux.tar.gz +tar -xvf ${DPCPP_PATH}/dpcpp_compiler.tar.gz -C ${DPCPP_PATH}/dpcpp_compiler diff --git a/.github/docker/opensuse-leap-15.Dockerfile b/.github/docker/opensuse-leap-15.Dockerfile new file mode 100644 index 0000000000..62a09b27ef --- /dev/null +++ b/.github/docker/opensuse-leap-15.Dockerfile @@ -0,0 +1,92 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of opensuse-leap-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("15") +FROM registry.hub.docker.com/opensuse/leap@sha256:1cf79e78bb69f39fb2f78a7c2c7ebc4b64cf8d82eb1df76cd36767a595ada7a8 + +# Set environment variables +ENV OS opensuse-leap +ENV OS_VER 15 +ENV NOTTY 1 + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). 
+ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + cmake \ + gcc \ + gcc-c++ \ + git \ + glibc-devel \ + libstdc++-devel \ + make" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-devel \ + python3-pip" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + gzip \ + libncurses5 \ + sudo \ + tar \ + wget" + +# add openSUSE Leap 15.5 Oss repo +RUN zypper ar -f https://download.opensuse.org/distribution/leap/15.5/repo/oss/ oss + +# Update and install required packages +RUN zypper update -y \ + && zypper install -y \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${MISC_DEPS} \ + && zypper clean all + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. dependencies (installed via pip) +# +# It's actively used and tested only on selected distros. Be aware +# they may not work, because pip packages list differ from OS to OS. +COPY third_party/requirements.txt /opt/ur/requirements.txt + +# Install DPC++ +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh +ENV DPCPP_PATH=/opt/dpcpp +RUN /opt/ur/install_dpcpp.sh + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' and switch to it +ENV USER test_user +ENV USERPASS pass +ENV PFILE ./password +RUN useradd -m ${USER} \ + && echo ${USERPASS} > ${PFILE} \ + && echo ${USERPASS} >> ${PFILE} \ + && passwd ${USER} < ${PFILE} \ + && rm -f ${PFILE} \ + && sed -i 's/# %wheel/%wheel/g' /etc/sudoers \ + && groupadd wheel \ + && gpasswd wheel -a ${USER} +USER test_user diff --git a/.github/docker/rockylinux-8.Dockerfile b/.github/docker/rockylinux-8.Dockerfile new file mode 100644 index 0000000000..7581cf5bd7 --- /dev/null +++ b/.github/docker/rockylinux-8.Dockerfile @@ -0,0 +1,93 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, 
under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of rockylinux-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("8.9") +FROM registry.hub.docker.com/library/rockylinux@sha256:9794037624aaa6212aeada1d28861ef5e0a935adaf93e4ef79837119f2a2d04c + +# Set environment variables +ENV OS rockylinux +ENV OS_VER 8 +ENV NOTTY 1 + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). +ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + cmake \ + git \ + glibc-devel \ + libstdc++-devel \ + make" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-pip" + +# Packages required by requirements.txt +ARG PRE_PYTHON_DEPS="\ + libjpeg-turbo-devel \ + python3-devel \ + python3-wheel \ + zlib-devel" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + ncurses-libs-6.1 \ + passwd \ + sudo \ + wget" + +# Update and install required packages +RUN dnf update -y \ + && dnf --enablerepo devel install -y \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${PRE_PYTHON_DEPS} \ + ${MISC_DEPS} \ + && dnf clean all + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. dependencies (installed via pip) +# +# It's actively used and tested only on selected distros. Be aware +# they may not work, because pip packages list differ from OS to OS. 
+COPY third_party/requirements.txt /opt/ur/requirements.txt + +# Install DPC++ +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh +ENV DPCPP_PATH=/opt/dpcpp +RUN /opt/ur/install_dpcpp.sh + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' +ENV USER test_user +ENV USERPASS pass +# Change shell to bash with safe pipe usage +SHELL [ "/bin/bash", "-o", "pipefail", "-c" ] +RUN useradd -m $USER \ + && echo "${USERPASS}" | passwd "${USER}" --stdin \ + && gpasswd wheel -a "${USER}" \ + && echo "%wheel ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers + +# Change shell back to default and switch to 'test_user' +SHELL ["/bin/sh", "-c"] +USER test_user diff --git a/.github/docker/rockylinux-9.Dockerfile b/.github/docker/rockylinux-9.Dockerfile new file mode 100644 index 0000000000..171e315cbe --- /dev/null +++ b/.github/docker/rockylinux-9.Dockerfile @@ -0,0 +1,85 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of rockylinux-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("9.3") +FROM registry.hub.docker.com/library/rockylinux@sha256:d7be1c094cc5845ee815d4632fe377514ee6ebcf8efaed6892889657e5ddaaa6 + +# Set environment variables +ENV OS rockylinux +ENV OS_VER 9 +ENV NOTTY 1 + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). 
+ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + cmake \ + git \ + glibc-devel \ + libstdc++-devel \ + make" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-pip" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + ncurses-libs-6.2 \ + passwd \ + sudo \ + wget" + +# Update and install required packages +RUN dnf update -y \ + && dnf --enablerepo devel install -y \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${MISC_DEPS} \ + && dnf clean all + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. dependencies (installed via pip) +# +# It's actively used and tested only on selected distros. Be aware +# they may not work, because pip packages list differ from OS to OS. +COPY third_party/requirements.txt /opt/ur/requirements.txt + +# Install DPC++ +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh +ENV DPCPP_PATH=/opt/dpcpp +RUN /opt/ur/install_dpcpp.sh + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' +ENV USER test_user +ENV USERPASS pass +# Change shell to bash with safe pipe usage +SHELL [ "/bin/bash", "-o", "pipefail", "-c" ] +RUN useradd -m $USER \ + && echo "${USERPASS}" | passwd "${USER}" --stdin \ + && gpasswd wheel -a "${USER}" \ + && echo "%wheel ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers + +# Change shell back to default and switch to 'test_user' +SHELL ["/bin/sh", "-c"] +USER test_user diff --git a/.github/docker/ubuntu-20.04.Dockerfile b/.github/docker/ubuntu-20.04.Dockerfile new file mode 100644 index 0000000000..2560bb10b9 --- /dev/null +++ b/.github/docker/ubuntu-20.04.Dockerfile @@ -0,0 +1,75 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of ubuntu-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("20.04") +FROM registry.hub.docker.com/library/ubuntu@sha256:d86db849e59626d94f768c679aba441163c996caf7a3426f44924d0239ffe03f + +# Set environment variables +ENV OS ubuntu +ENV OS_VER 20.04 +ENV NOTTY 1 +ENV DEBIAN_FRONTEND noninteractive + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). +ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + build-essential \ + git" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-pip" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + g++-7 \ + libncurses5 \ + sudo \ + wget \ + whois" + +# Update and install required packages +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${MISC_DEPS} \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean all + +# Install CMake from source (the version in apt it's too old) +RUN wget https://cmake.org/files/v3.20/cmake-3.20.0-linux-x86_64.sh -O cmake.sh \ + && chmod +x cmake.sh \ + && ./cmake.sh --skip-license --prefix=/usr/local + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. 
dependencies (installed via pip) +COPY third_party/requirements.txt /opt/ur/requirements.txt +RUN pip3 install --no-cache-dir -r /opt/ur/requirements.txt + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' and switch to it +ENV USER test_user +ENV USERPASS pass +RUN useradd -m "${USER}" -g sudo -p "$(mkpasswd ${USERPASS})" +USER test_user diff --git a/.github/docker/ubuntu-22.04.Dockerfile b/.github/docker/ubuntu-22.04.Dockerfile index 09eaab03a8..d4b3a828fc 100644 --- a/.github/docker/ubuntu-22.04.Dockerfile +++ b/.github/docker/ubuntu-22.04.Dockerfile @@ -4,11 +4,12 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Dockerfile - image with all Unified Runtime dependencies. +# Dockerfile - a 'recipe' for Docker to build an image of ubuntu-based +# environment for building the Unified Runtime project. # -# Pull base image -FROM registry.hub.docker.com/library/ubuntu:22.04 +# Pull base image ("22.04") +FROM registry.hub.docker.com/library/ubuntu@sha256:0eb0f877e1c869a300c442c41120e778db7161419244ee5cbc6fa5f134e74736 # Set environment variables ENV OS ubuntu @@ -32,15 +33,13 @@ ARG BASE_DEPS="\ ARG UR_DEPS="\ doxygen \ python3 \ - python3-pip" - -# Unified Runtime's dependencies (installed via pip) -ARG UR_PYTHON_DEPS="\ - clang-format==15.0.7" + python3-pip \ + libhwloc-dev" # Miscellaneous for our builds/CI (optional) ARG MISC_DEPS="\ clang \ + libncurses5 \ sudo \ wget \ whois" @@ -54,18 +53,21 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean all -# pip package is pinned to a version, but it's probably improperly parsed here -# hadolint ignore=DL3013 -RUN pip3 install --no-cache-dir ${UR_PYTHON_DEPS} +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. 
dependencies (installed via pip) +COPY third_party/requirements.txt /opt/ur/requirements.txt +RUN pip3 install --no-cache-dir -r /opt/ur/requirements.txt # Install DPC++ -COPY install_dpcpp.sh /opt/install_dpcpp.sh +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh ENV DPCPP_PATH=/opt/dpcpp -RUN /opt/install_dpcpp.sh +RUN /opt/ur/install_dpcpp.sh # Install libbacktrace -COPY install_libbacktrace.sh /opt/install_libbacktrace.sh -RUN /opt/install_libbacktrace.sh +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh # Add a new (non-root) 'test_user' and switch to it ENV USER test_user diff --git a/.github/docker/ubuntu-24.04.Dockerfile b/.github/docker/ubuntu-24.04.Dockerfile new file mode 100644 index 0000000000..6d232e1296 --- /dev/null +++ b/.github/docker/ubuntu-24.04.Dockerfile @@ -0,0 +1,75 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# +# Dockerfile - a 'recipe' for Docker to build an image of ubuntu-based +# environment for building the Unified Runtime project. +# + +# Pull base image ("24.04") +FROM registry.hub.docker.com/library/ubuntu@sha256:340d9b015b194dc6e2a13938944e0d016e57b9679963fdeb9ce021daac430221 + +# Set environment variables +ENV OS ubuntu +ENV OS_VER 24.04 +ENV NOTTY 1 +ENV DEBIAN_FRONTEND noninteractive + +# Additional parameters to build docker without building components. +# These ARGs can be set in docker building phase and are used +# within bash scripts (executed within docker). 
+ARG SKIP_DPCPP_BUILD +ARG SKIP_LIBBACKTRACE_BUILD + +# Base development packages +ARG BASE_DEPS="\ + build-essential \ + cmake \ + git" + +# Unified Runtime's dependencies +ARG UR_DEPS="\ + doxygen \ + python3 \ + python3-pip" + +# Miscellaneous for our builds/CI (optional) +ARG MISC_DEPS="\ + clang \ + libncurses5 \ + sudo \ + wget \ + whois" + +# Update and install required packages +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + ${BASE_DEPS} \ + ${UR_DEPS} \ + ${MISC_DEPS} \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean all + +# Prepare a dir (accessible by anyone) +RUN mkdir --mode 777 /opt/ur/ + +# Additional dev. dependencies (installed via pip) +COPY third_party/requirements.txt /opt/ur/requirements.txt +RUN pip3 install --no-cache-dir -r /opt/ur/requirements.txt + +# Install DPC++ +COPY .github/docker/install_dpcpp.sh /opt/ur/install_dpcpp.sh +ENV DPCPP_PATH=/opt/dpcpp +RUN /opt/ur/install_dpcpp.sh + +# Install libbacktrace +COPY .github/docker/install_libbacktrace.sh /opt/ur/install_libbacktrace.sh +RUN /opt/ur/install_libbacktrace.sh + +# Add a new (non-root) 'test_user' and switch to it +ENV USER test_user +ENV USERPASS pass +RUN useradd -m "${USER}" -g sudo -p "$(mkpasswd ${USERPASS})" +USER test_user diff --git a/.github/scripts/install_hwloc.sh b/.github/scripts/install_hwloc.sh new file mode 100755 index 0000000000..c3299f5881 --- /dev/null +++ b/.github/scripts/install_hwloc.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# install_hwloc.sh - Script for building and installing HWLOC library from source code + +set -e + +git clone -b hwloc-2.3.0 https://github.com/open-mpi/hwloc.git +pushd hwloc +./autogen.sh +./configure +make -j$(nproc) +sudo make install -j$(nproc) +popd diff --git a/.github/workflows/benchmarks_compute.yml b/.github/workflows/benchmarks_compute.yml index 619784b263..ee74a52ad0 100644 --- a/.github/workflows/benchmarks_compute.yml +++ b/.github/workflows/benchmarks_compute.yml @@ -12,6 +12,7 @@ on: default: 'level_zero' options: - level_zero + - level_zero_v2 unit: description: Test unit (cpu/gpu) type: choice @@ -34,6 +35,16 @@ on: type: string required: false default: '' + sycl_repo: + description: 'Compiler repo' + type: string + required: true + default: 'intel/llvm' + sycl_commit: + description: 'Compiler commit' + type: string + required: false + default: '' permissions: contents: read @@ -41,15 +52,14 @@ permissions: jobs: e2e-build-hw: - # Run only on upstream; forks will not have the HW - # if: github.repository == 'oneapi-src/unified-runtime' name: Build SYCL, UR, run Compute Benchmarks strategy: matrix: adapter: [ {str_name: "${{inputs.str_name}}", sycl_config: "${{inputs.sycl_config_params}}", - unit: "${{inputs.unit}}"} + unit: "${{inputs.unit}}" + } ] build_type: [Release] compiler: [{c: clang, cxx: clang++}] @@ -105,12 +115,19 @@ jobs: - name: Checkout SYCL uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: - repository: intel/llvm + repository: ${{inputs.sycl_repo}} ref: refs/heads/sycl path: sycl-repo fetch-depth: 1 fetch-tags: false + - name: Fetch specific SYCL commit + if: inputs.sycl_commit != '' + working-directory: ./sycl-repo + run: | + git fetch --depth=1 origin ${{ inputs.sycl_commit }} + git checkout ${{ inputs.sycl_commit }} + - name: Set CUDA env vars if: matrix.adapter.str_name == 'cuda' run: | @@ -126,21 +143,40 @@ jobs: 
--ci-defaults ${{matrix.adapter.sycl_config}} --cmake-opt="-DLLVM_INSTALL_UTILS=ON" --cmake-opt="-DSYCL_PI_TESTS=OFF" - --cmake-opt="-DSYCL_PI_UR_USE_FETCH_CONTENT=OFF" - --cmake-opt="-DSYCL_PI_UR_SOURCE_DIR=${{github.workspace}}/ur-repo/" + --cmake-opt="-DSYCL_UR_USE_FETCH_CONTENT=OFF" + --cmake-opt="-DSYCL_UR_SOURCE_DIR=${{github.workspace}}/ur-repo/" --cmake-opt=-DCMAKE_C_COMPILER_LAUNCHER=ccache --cmake-opt=-DCMAKE_CXX_COMPILER_LAUNCHER=ccache - name: Build SYCL run: cmake --build ${{github.workspace}}/sycl_build -j - - name: Set oneAPI Device Selector - run: | - echo "ONEAPI_DEVICE_SELECTOR=${{ matrix.adapter.str_name }}:${{ matrix.adapter.unit }}" >> $GITHUB_ENV + - name: Configure UR + run: > + cmake -DCMAKE_BUILD_TYPE=Release + -S${{github.workspace}}/ur-repo + -B${{github.workspace}}/ur_build + -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/ur_install + -DUR_BUILD_TESTS=OFF + -DUR_BUILD_ADAPTER_L0=ON + -DUR_BUILD_ADAPTER_L0_V2=ON + -DUMF_DISABLE_HWLOC=ON + + - name: Build UR + run: cmake --build ${{github.workspace}}/ur_build -j $(nproc) + + - name: Install UR + run: cmake --install ${{github.workspace}}/ur_build - name: Run benchmarks id: benchmarks - run: numactl -N 0 ${{ github.workspace }}/ur-repo/scripts/benchmarks/main.py ~/bench_workdir ${{github.workspace}}/sycl_build ${{ inputs.bench_script_params }} + run: > + numactl -N 0 ${{ github.workspace }}/ur-repo/scripts/benchmarks/main.py + ~/bench_workdir + ${{github.workspace}}/sycl_build + ${{github.workspace}}/ur_install + ${{ matrix.adapter.str_name }} + ${{ inputs.bench_script_params }} - name: Add comment to PR uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 diff --git a/.github/workflows/build-hw-reusable.yml b/.github/workflows/build-hw-reusable.yml index 1154b98c42..77f696b958 100644 --- a/.github/workflows/build-hw-reusable.yml +++ b/.github/workflows/build-hw-reusable.yml @@ -4,7 +4,10 @@ name: Build - Adapters on HW - Reusable on: workflow_call: inputs: - name: + 
adapter_name: + required: true + type: string + runner_name: required: true type: string platform: @@ -15,6 +18,10 @@ on: required: false type: string default: OFF + static_adapter: + required: false + type: string + default: OFF permissions: contents: read @@ -33,7 +40,7 @@ jobs: strategy: matrix: adapter: [ - {name: "${{inputs.name}}", platform: "${{inputs.platform}}", static_Loader: "${{inputs.static_loader}}"}, + {name: "${{inputs.adapter_name}}", platform: "${{inputs.platform}}", static_Loader: "${{inputs.static_loader}}", static_adapter: "${{inputs.static_loader}}"}, ] build_type: [Debug, Release] compiler: [{c: gcc, cxx: g++}, {c: clang, cxx: clang++}] @@ -46,8 +53,12 @@ jobs: build_type: Release - adapter: {static_Loader: ON} compiler: {c: clang, cxx: clang++} + - adapter: {static_adapter: ON} + build_type: Release + - adapter: {static_adapter: ON} + compiler: {c: clang, cxx: clang++} - runs-on: ${{matrix.adapter.name}} + runs-on: ${{inputs.runner_name}} steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -71,10 +82,13 @@ jobs: -DUR_ENABLE_TRACING=ON -DUR_DEVELOPER_MODE=ON -DUR_BUILD_TESTS=ON + -DUR_CONFORMANCE_TEST_LOADER=OFF -DUR_BUILD_ADAPTER_${{matrix.adapter.name}}=ON -DUR_STATIC_LOADER=${{matrix.adapter.static_Loader}} + -DUR_STATIC_ADAPTER_${{matrix.adapter.name}}=${{matrix.adapter.static_adapter}} -DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++ -DUR_SYCL_LIBRARY_DIR=${{github.workspace}}/dpcpp_compiler/lib + -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/install ${{ matrix.adapter.name == 'HIP' && '-DUR_CONFORMANCE_AMD_ARCH=gfx1030' || '' }} ${{ matrix.adapter.name == 'HIP' && '-DUR_HIP_PLATFORM=AMD' || '' }} @@ -82,6 +96,10 @@ jobs: # This is so that device binaries can find the sycl runtime library run: cmake --build ${{github.workspace}}/build -j $(nproc) + - name: Install + # This is to check that install command does not fail + run: cmake --install ${{github.workspace}}/build + - name: Test adapter 
specific working-directory: ${{github.workspace}}/build run: ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" --timeout 180 diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index d8d2479587..8c0e3bac86 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -20,6 +20,7 @@ jobs: compiler: [{c: gcc, cxx: g++}] libbacktrace: ['-DVAL_USE_LIBBACKTRACE_BACKTRACE=OFF'] pool_tracking: ['-DUMF_ENABLE_POOL_TRACKING=ON', '-DUMF_ENABLE_POOL_TRACKING=OFF'] + latency_tracking: ['-DUR_ENABLE_LATENCY_HISTOGRAM=OFF'] include: - os: 'ubuntu-22.04' build_type: Release @@ -36,8 +37,11 @@ jobs: - os: 'ubuntu-20.04' build_type: Release compiler: {c: gcc-7, cxx: g++-7} - - runs-on: ${{matrix.os}} + - os: 'ubuntu-22.04' + build_type: Release + compiler: {c: clang, cxx: clang++} + latency_tracking: '-DUR_ENABLE_LATENCY_HISTOGRAM=ON' + runs-on: ${{ (matrix.os == 'ubuntu-22.04' && github.repository_owner == 'oneapi-src') && 'intel-ubuntu-22.04' || matrix.os }} steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -45,16 +49,19 @@ jobs: - name: Install apt packages run: | sudo apt-get update - sudo apt-get install -y doxygen ${{matrix.compiler.c}} + sudo apt-get install -y ${{matrix.compiler.c}} devscripts + + - name: Install libhwloc + run: .github/scripts/install_hwloc.sh + + - name: Setup PATH + run: echo "$HOME/.local/bin" >> $GITHUB_PATH - name: Install g++-7 if: matrix.compiler.cxx == 'g++-7' run: | sudo apt-get install -y ${{matrix.compiler.cxx}} - - name: Install pip packages - run: pip install -r third_party/requirements.txt - - name: Install libbacktrace if: matrix.libbacktrace == '-DVAL_USE_LIBBACKTRACE_BACKTRACE=ON' run: | @@ -69,12 +76,13 @@ jobs: if: matrix.os == 'ubuntu-22.04' run: | sudo apt install libncurses5 - wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz + wget -O 
${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-09-27/sycl_linux.tar.gz mkdir -p ${{github.workspace}}/dpcpp_compiler tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C ${{github.workspace}}/dpcpp_compiler - name: Configure CMake if: matrix.os == 'ubuntu-22.04' + # WEXTRA: https://github.com/oneapi-src/unified-runtime/issues/2109 run: > cmake -B${{github.workspace}}/build @@ -83,13 +91,18 @@ jobs: -DUR_ENABLE_TRACING=ON -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DUR_BUILD_TESTS=ON - -DUR_FORMAT_CPP_STYLE=ON + -DUR_FORMAT_CPP_STYLE=OFF + -DUR_DEVELOPER_MODE=ON -DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++ + -DUR_CONFORMANCE_TEST_LOADER=OFF ${{matrix.libbacktrace}} ${{matrix.pool_tracking}} + ${{matrix.latency_tracking}} - name: Configure CMake if: matrix.os == 'ubuntu-20.04' + # WEXTRA: https://github.com/oneapi-src/unified-runtime/issues/2109 + # Note: Disable Werror, since 20.04 raises different ones than 22.04 run: > cmake -B${{github.workspace}}/build @@ -98,20 +111,20 @@ jobs: -DUR_ENABLE_TRACING=ON -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DUR_BUILD_TESTS=ON - -DUR_FORMAT_CPP_STYLE=ON + -DUR_FORMAT_CPP_STYLE=OFF + -DUR_DEVELOPER_MODE=OFF ${{matrix.libbacktrace}} ${{matrix.pool_tracking}} - - - name: Generate source from spec, check for uncommitted diff - if: matrix.os == 'ubuntu-22.04' - run: cmake --build ${{github.workspace}}/build --target check-generated - - - name: Verify that each source file contains a license - run: cmake --build ${{github.workspace}}/build --target verify-licenses + ${{matrix.latency_tracking}} - name: Build run: cmake --build ${{github.workspace}}/build -j $(nproc) + - name: Verify hardening flags have been set + run: cmake --build ${{github.workspace}}/build --target verify-hardening + # https://github.com/oneapi-src/unified-runtime/issues/2120 + if: ${{ matrix.compiler.cxx != 'clang++' && matrix.os != 'ubuntu-20.04' }} + - name: Test working-directory: 
${{github.workspace}}/build run: ctest -C ${{matrix.build_type}} --output-on-failure -L "umf|loader|validation|tracing|unit|urtrace" @@ -126,39 +139,53 @@ jobs: name: Level Zero uses: ./.github/workflows/build-hw-reusable.yml with: - name: L0 + adapter_name: L0 + runner_name: L0 + + level-zero-v2: + name: Level Zero V2 + uses: ./.github/workflows/build-hw-reusable.yml + with: + adapter_name: L0_V2 + runner_name: L0 level-zero-static: name: Level Zero static uses: ./.github/workflows/build-hw-reusable.yml with: - name: L0 + adapter_name: L0 + runner_name: L0 static_loader: ON + static_adapter: ON opencl: name: OpenCL uses: ./.github/workflows/build-hw-reusable.yml with: - name: OPENCL + adapter_name: OPENCL + runner_name: OPENCL platform: "Intel(R) OpenCL" cuda: name: CUDA uses: ./.github/workflows/build-hw-reusable.yml with: - name: CUDA + adapter_name: CUDA + runner_name: CUDA hip: name: HIP uses: ./.github/workflows/build-hw-reusable.yml with: - name: HIP + adapter_name: HIP + runner_name: HIP native-cpu: name: Native CPU uses: ./.github/workflows/build-hw-reusable.yml with: - name: NATIVE_CPU + adapter_name: NATIVE_CPU + runner_name: NATIVE_CPU e2e-level-zero: name: E2E L0 @@ -190,59 +217,62 @@ jobs: matrix: os: ['windows-2019', 'windows-2022'] adapter: [ - {name: None, var: ''}, {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON'} + {name: None, var: ''}, {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON'}, + {name: None, var: ''}, {name: L0_V2, var: '-DUR_BUILD_ADAPTER_L0_V2=ON'}, + {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON -DUR_STATIC_ADAPTER_L0=ON'} ] # TODO: building level zero loader on windows-2019 and clang-cl is currently broken exclude: - os: 'windows-2019' adapter: {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON'} + - os: 'windows-2019' + adapter: {name: L0_V2, var: '-DUR_BUILD_ADAPTER_L0_V2=ON'} + - os: 'windows-2019' + adapter: {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON -DUR_STATIC_ADAPTER_L0=ON'} - adapter: {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON'} compiler: {c: 
clang-cl, cxx: clang-cl} + - adapter: {name: L0_V2, var: '-DUR_BUILD_ADAPTER_L0_V2=ON'} + compiler: {c: clang-cl, cxx: clang-cl} + - adapter: {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON -DUR_STATIC_ADAPTER_L0=ON'} + compiler: {c: clang-cl, cxx: clang-cl} build_type: [Debug, Release] compiler: [{c: cl, cxx: cl}, {c: clang-cl, cxx: clang-cl}] include: - compiler: {c: clang-cl, cxx: clang-cl} toolset: "-T ClangCL" + - os: 'windows-2022' + adapter: {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON -DUR_STATIC_ADAPTER_L0=ON'} + build_type: 'Release' + compiler: {c: cl, cxx: cl} + runs-on: ${{matrix.os}} steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 - with: - python-version: 3.9 - - - name: Install prerequisites - run: python3 -m pip install -r third_party/requirements.txt - - - name: Install doxygen - run: | - $WorkingDir = $PWD.Path - Invoke-WebRequest -Uri https://github.com/doxygen/doxygen/releases/download/Release_1_9_8/doxygen-1.9.8.windows.x64.bin.zip -OutFile "$WorkingDir\doxygen.zip" - Expand-Archive -Path "$WorkingDir\doxygen.zip" - Add-Content $env:GITHUB_PATH "$WorkingDir\doxygen" + - name: Install hwloc + run: vcpkg install hwloc:x64-windows - name: Configure CMake + env: + VCPKG_PATH: "C:/vcpkg/packages/hwloc_x64-windows" run: > cmake -B${{github.workspace}}/build ${{matrix.toolset}} + -DCMAKE_PREFIX_PATH="${{env.VCPKG_PATH}}" -DCMAKE_C_COMPILER=${{matrix.compiler.c}} -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} -DCMAKE_POLICY_DEFAULT_CMP0094=NEW -DUR_ENABLE_TRACING=ON -DUR_DEVELOPER_MODE=ON -DUR_BUILD_TESTS=ON - -DUR_FORMAT_CPP_STYLE=ON + -DUR_FORMAT_CPP_STYLE=OFF + -DUR_CONFORMANCE_TEST_LOADER=OFF ${{matrix.adapter.var}} - # TODO: re-enable when check-generated is fixed for windows runners see #888 - # - name: Generate source from spec, check for uncommitted diff - # if: matrix.os == 'windows-2022' - # run: cmake --build ${{github.workspace}}/build 
--target check-generated --config ${{matrix.build_type}} - - name: Build all run: cmake --build ${{github.workspace}}/build --config ${{matrix.build_type}} -j $Env:NUMBER_OF_PROCESSORS @@ -254,7 +284,7 @@ jobs: name: Build - MacOS strategy: matrix: - os: ['macos-12', 'macos-13'] + os: ['macos-13'] runs-on: ${{matrix.os}} steps: @@ -267,6 +297,9 @@ jobs: - name: Install prerequisites run: python3 -m pip install -r third_party/requirements.txt + - name: Install hwloc + run: brew install hwloc + - name: Configure CMake run: > cmake diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index b0ed45d6b5..67e810f687 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -12,7 +12,7 @@ permissions: jobs: analyze-ubuntu: name: Analyze on Ubuntu - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} permissions: security-events: write @@ -27,10 +27,16 @@ jobs: uses: github/codeql-action/init@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 with: languages: cpp, python + config-file: ./.github/codeql/codeql-config.yml - name: Install pip packages run: pip install -r third_party/requirements.txt + - name: Install apt packages + run: | + sudo apt-get update + sudo apt-get install -y libhwloc-dev + - name: Configure CMake run: cmake -B ${{github.workspace}}/build -DUR_DEVELOPER_MODE=ON -DUR_BUILD_TESTS=ON -DUR_ENABLE_TRACING=ON -DUR_BUILD_TOOLS=ON -DUMF_ENABLE_POOL_TRACKING=ON @@ -61,8 +67,13 @@ jobs: - name: Install pip packages run: python3 -m pip install -r third_party/requirements.txt + - name: Install hwloc + run: vcpkg install hwloc:x64-windows + - name: Configure CMake - run: cmake -B ${{github.workspace}}/build -DCMAKE_POLICY_DEFAULT_CMP0094=NEW -DUR_DEVELOPER_MODE=ON -DUR_BUILD_TESTS=ON -DUR_ENABLE_TRACING=ON -DUR_BUILD_TOOLS=ON -DUMF_ENABLE_POOL_TRACKING=ON + env: + VCPKG_PATH: "C:/vcpkg/packages/hwloc_x64-windows" + run: cmake -B 
${{github.workspace}}/build -DCMAKE_POLICY_DEFAULT_CMP0094=NEW -DUR_DEVELOPER_MODE=ON -DUR_BUILD_TESTS=ON -DUR_ENABLE_TRACING=ON -DUR_BUILD_TOOLS=ON -DUMF_ENABLE_POOL_TRACKING=ON -DCMAKE_PREFIX_PATH="${{env.VCPKG_PATH}}" - name: Build run: cmake --build ${{github.workspace}}/build -j $(nproc) --config Release diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index ff6fdf6fde..ba0230d600 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -1,5 +1,5 @@ # -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. # See LICENSE.TXT diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 0704038829..710aa659c8 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -23,7 +23,7 @@ concurrency: jobs: # Build job build: - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} steps: - name: Checkout uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -57,7 +57,7 @@ jobs: environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} needs: build steps: - name: Deploy to GitHub Pages diff --git a/.github/workflows/e2e_core.yml b/.github/workflows/e2e_core.yml index 0621093169..32b8d58e7a 100644 --- a/.github/workflows/e2e_core.yml +++ b/.github/workflows/e2e_core.yml @@ -54,7 +54,7 @@ permissions: jobs: changed-files: name: Check for changed files - runs-on: ubuntu-22.04 + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} outputs: any_changed: ${{ steps.get-changed.outputs.any_changed }} steps: @@ -65,6 +65,8 @@ jobs: with: files: | source/adapters/${{inputs.str_name}}/** + 
source/loader/** + .github/workflows/e2e* e2e-build-hw: # We want to run the job only if there are changes in the specific adapter @@ -128,8 +130,8 @@ jobs: --ci-defaults ${{matrix.adapter.config}} --cmake-opt="-DLLVM_INSTALL_UTILS=ON" --cmake-opt="-DSYCL_PI_TESTS=OFF" - --cmake-opt="-DSYCL_PI_UR_USE_FETCH_CONTENT=OFF" - --cmake-opt="-DSYCL_PI_UR_SOURCE_DIR=${{github.workspace}}/ur-repo/" + --cmake-opt="-DSYCL_UR_USE_FETCH_CONTENT=OFF" + --cmake-opt="-DSYCL_UR_SOURCE_DIR=${{github.workspace}}/ur-repo/" --cmake-opt=-DCMAKE_C_COMPILER_LAUNCHER=ccache --cmake-opt=-DCMAKE_CXX_COMPILER_LAUNCHER=ccache @@ -155,7 +157,7 @@ jobs: - name: Setup SYCL variables run: | which clang++ sycl-ls - SYCL_PI_TRACE=-1 sycl-ls + SYCL_UR_TRACE=-1 sycl-ls - name: Build e2e tests run: > @@ -167,17 +169,24 @@ jobs: -DCMAKE_CXX_COMPILER="$(which clang++)" -DLLVM_LIT="${{github.workspace}}/sycl-repo/llvm/utils/lit/lit.py" - - name: Set test filters for L0 - if: matrix.adapter.name == 'L0' - run: | - echo "LIT_XFAIL_NOT=${{inputs.xfail_not}}" >> $GITHUB_ENV - echo "LIT_XFAIL=${{inputs.xfail}}" >> $GITHUB_ENV - echo "LIT_FILTER_OUT=${{inputs.filter_out}}" >> $GITHUB_ENV + - name: Set LIT_XFAIL + if: inputs.xfail != '' + run: echo "LIT_XFAIL=${{inputs.xfail}}" >> $GITHUB_ENV + + - name: Set LIT_FILTER_OUT + if: inputs.filter_out != '' + run: echo "LIT_FILTER_OUT=${{inputs.filter_out}}" >> $GITHUB_ENV + + - name: Set LIT_XFAIL_NOT + if: inputs.xfail_not != '' + run: echo "LIT_XFAIL_NOT=${{inputs.xfail_not}}" >> $GITHUB_ENV # TODO: remove once intel/llvm lit tests can properly recognize the GPU - name: Configure hardware platform feature for L0 if: matrix.adapter.name == 'L0' - run: sed -i '/import lit.llvm/i config.available_features.add("gpu-intel-pvc-1T")' build-e2e/lit.site.cfg.py + run: | + sed -i '/import lit.llvm/i config.available_features.add("gpu-intel-pvc-1T")' build-e2e/lit.site.cfg.py + sed -i '/import lit.llvm/i config.available_features.add("gpu-intel-pvc")' 
build-e2e/lit.site.cfg.py - name: Run e2e tests id: tests diff --git a/.github/workflows/e2e_cuda.yml b/.github/workflows/e2e_cuda.yml index 6bf181b0a4..c2f1d969b8 100644 --- a/.github/workflows/e2e_cuda.yml +++ b/.github/workflows/e2e_cuda.yml @@ -20,3 +20,5 @@ jobs: prefix: "ext_oneapi_" config: "--cuda" unit: "gpu" + extra_lit_flags: "-sv --max-time=3600" + xfail: "Regression/device_num.cpp" diff --git a/.github/workflows/e2e_level_zero.yml b/.github/workflows/e2e_level_zero.yml index 39f4a3082c..1fd814f271 100644 --- a/.github/workflows/e2e_level_zero.yml +++ b/.github/workflows/e2e_level_zero.yml @@ -21,9 +21,11 @@ jobs: config: "" unit: "gpu" # Failing tests - xfail: "Matrix/SG32/get_coord_int8_matB.cpp;Matrix/element_wise_all_ops_1d.cpp;Matrix/element_wise_all_ops_1d_cont.cpp;Matrix/element_wise_all_ops_scalar.cpp;Matrix/get_coord_int8_matB.cpp;Matrix/joint_matrix_rowmajorA_rowmajorB.cpp;Plugin/level_zero_barrier_optimization.cpp" + xfail: "InvokeSimd/Regression/call_vadd_1d_spill.cpp;InvokeSimd/Regression/ImplicitSubgroup/call_vadd_1d_spill.cpp;ESIMD/mask_expand_load.cpp;Matrix/joint_matrix_prefetch.cpp;ESIMD/mask_expand_load.cpp;Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp;Matrix/joint_matrix_bf16_fill_k_cache_prefetch.cpp;Matrix/SPVCooperativeMatrix/element_wise_ops.cpp;" + # Unexpectedly Passed Tests + xfail_not: "" # Flaky tests - filter_out: "UserDefinedReductions/user_defined_reductions.cpp" + filter_out: "Basic/accessor/accessor.cpp|DeviceArchitecture/device_architecture_comparison_on_device_aot.cpp|Graph/Explicit/interop-level-zero-launch-kernel.cpp|Graph/RecordReplay/interop-level-zero-launch-kernel.cpp|syclcompat/launch/launch_policy_lmem.cpp" # These runners by default spawn upwards of 260 workers. 
# We also add a time out just in case some test hangs - extra_lit_flags: "--param gpu-intel-pvc=True -sv -j 100 --max-time 600" + extra_lit_flags: "--param gpu-intel-pvc=True --param gpu-intel-pvc-1T=True -sv -j 100 --max-time=3600" diff --git a/.github/workflows/e2e_opencl.yml b/.github/workflows/e2e_opencl.yml index 5264b668f2..e4714b2434 100644 --- a/.github/workflows/e2e_opencl.yml +++ b/.github/workflows/e2e_opencl.yml @@ -20,3 +20,5 @@ jobs: prefix: "" config: "" unit: "cpu" + xfail: "AOT/double.cpp;AOT/half.cpp;AOT/reqd-sg-size.cpp;Basic/built-ins/marray_geometric.cpp;KernelCompiler/kernel_compiler_spirv.cpp;KernelCompiler/opencl_queries.cpp;NonUniformGroups/ballot_group.cpp;NonUniformGroups/ballot_group_algorithms.cpp;NonUniformGroups/fixed_size_group_algorithms.cpp;NonUniformGroups/opportunistic_group.cpp;NonUniformGroups/opportunistic_group_algorithms.cpp;NonUniformGroups/tangle_group.cpp;NonUniformGroups/tangle_group_algorithms.cpp" + extra_lit_flags: "-sv --max-time=3600" diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index d0cb335d96..faf7060503 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -18,6 +18,6 @@ jobs: permissions: contents: read pull-requests: write - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} steps: - uses: actions/labeler@8558fd74291d67161a8a78ce36a881fa63b766a9 # v5.0.0 diff --git a/.github/workflows/multi_device.yml b/.github/workflows/multi_device.yml index b9ae70ece6..ebdb982148 100644 --- a/.github/workflows/multi_device.yml +++ b/.github/workflows/multi_device.yml @@ -17,7 +17,8 @@ jobs: strategy: matrix: adapter: [ - {name: L0} + {name: L0}, + {name: L0_V2} ] build_type: [Debug, Release] compiler: [{c: gcc, cxx: g++}] # TODO: investigate why memory-adapter-level_zero hangs with clang @@ -30,12 +31,11 @@ jobs: - name: Install pip packages run: pip install -r third_party/requirements.txt - # 
TODO: enable once test failure are fixed/ignored - # - name: Download DPC++ - # run: | - # wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz - # mkdir dpcpp_compiler - # tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C dpcpp_compiler + - name: Download DPC++ + run: | + wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz + mkdir dpcpp_compiler + tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C dpcpp_compiler - name: Configure CMake shell: bash -el {0} @@ -48,7 +48,10 @@ jobs: -DUR_DEVELOPER_MODE=ON -DUR_BUILD_TESTS=ON -DUR_BUILD_ADAPTER_${{matrix.adapter.name}}=ON + -DUR_CONFORMANCE_TEST_LOADER=OFF -DUR_TEST_DEVICES_COUNT=2 + -DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++ + -DUR_SYCL_LIBRARY_DIR=${{github.workspace}}/dpcpp_compiler/lib - name: Build run: cmake --build ${{github.workspace}}/build -j $(nproc) @@ -60,4 +63,4 @@ jobs: - name: Test adapters working-directory: ${{github.workspace}}/build - run: env UR_CTS_ADAPTER_PLATFORM="${{matrix.adapter.platform}}" ctest -C ${{matrix.build_type}} --output-on-failure -L "conformance" --timeout 180 + run: env UR_CTS_ADAPTER_PLATFORM="${{matrix.adapter.platform}}" ctest -C ${{matrix.build_type}} --output-on-failure -L "conformance" -E "enqueue|kernel|program|integration|exp_command_buffer|exp_enqueue_native|exp_launch_properties|exp_usm_p2p" --timeout 180 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 6c14faf7df..06d4026676 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -1,6 +1,7 @@ name: Nightly on: + workflow_dispatch: schedule: # Run every day at 23:00 UTC - cron: '0 23 * * *' diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml index f1436fc46a..f466cc693e 100644 --- a/.github/workflows/prerelease.yml +++ 
b/.github/workflows/prerelease.yml @@ -11,7 +11,7 @@ permissions: jobs: weekly-prerelease: - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} permissions: contents: write steps: diff --git a/.github/workflows/source-checks.yml b/.github/workflows/source-checks.yml new file mode 100644 index 0000000000..e73f403320 --- /dev/null +++ b/.github/workflows/source-checks.yml @@ -0,0 +1,71 @@ +name: Source Checks + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + source-checks: + name: Source Checks + strategy: + matrix: + os: ['ubuntu-22.04', 'windows-2022'] + + runs-on: ${{matrix.os}} + + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + with: + python-version: 3.9 + + - name: Install pip packages + run: pip install -r third_party/requirements.txt + + - name: "[Lin] Install doxygen" + if: matrix.os == 'ubuntu-22.04' + run: | + sudo apt-get update + sudo apt-get install -y doxygen + + - name: "[Win] Install doxygen" + if: matrix.os == 'windows-2022' + run: | + $WorkingDir = $PWD.Path + Invoke-WebRequest -Uri https://github.com/doxygen/doxygen/releases/download/Release_1_9_8/doxygen-1.9.8.windows.x64.bin.zip -OutFile "$WorkingDir\doxygen.zip" + Expand-Archive -Path "$WorkingDir\doxygen.zip" + Add-Content $env:GITHUB_PATH "$WorkingDir\doxygen" + + - name: "[Lin] Install hwloc" + if: matrix.os == 'ubuntu-22.04' + run: .github/scripts/install_hwloc.sh + + - name: "[Win] Install hwloc" + if: matrix.os == 'windows-2022' + run: vcpkg install hwloc:x64-windows + + - name: Configure CMake + env: + VCPKG_PATH: "C:/vcpkg/packages/hwloc_x64-windows" + run: > + cmake + -B${{github.workspace}}/build + -DCMAKE_PREFIX_PATH="${{env.VCPKG_PATH}}" + 
-DUR_ENABLE_TRACING=OFF + -DCMAKE_BUILD_TYPE=Debug + -DUR_BUILD_TESTS=OFF + -DUR_FORMAT_CPP_STYLE=ON + + # Verifying license should be enough on a single OS + - name: Verify that each source file contains a license + if: matrix.os == 'ubuntu-22.04' + run: cmake --build ${{github.workspace}}/build --target verify-licenses + + - name: Generate source from spec, check for uncommitted diff + run: cmake --build ${{github.workspace}}/build --target check-generated diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 2efb04c86a..d20b8d25be 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -19,7 +19,7 @@ permissions: jobs: linux: name: Trivy - runs-on: ubuntu-latest + runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} permissions: security-events: write @@ -35,6 +35,7 @@ jobs: format: 'sarif' output: 'trivy-results.sarif' exit-code: 1 # Fail if issue found + skip-dirs: '**/_deps/**' # file with suppressions: .trivyignore (in root dir) - name: Print report and trivyignore file diff --git a/CMakeLists.txt b/CMakeLists.txt index a5870ddde0..4e970e2759 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,12 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception cmake_minimum_required(VERSION 3.20.0 FATAL_ERROR) -project(unified-runtime VERSION 0.10.0) +project(unified-runtime VERSION 0.11.0) + +# Check if unified runtime is built as a standalone project. +if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR UR_STANDALONE_BUILD) + set(UR_STANDALONE_BUILD TRUE) +endif() include(GNUInstallDirs) include(CheckCXXSourceCompiles) @@ -29,7 +34,7 @@ option(UR_BUILD_EXAMPLES "Build example applications." ON) option(UR_BUILD_TESTS "Build unit tests." 
ON) option(UR_BUILD_TOOLS "build ur tools" ON) option(UR_FORMAT_CPP_STYLE "format code style of C++ sources" OFF) -option(UR_DEVELOPER_MODE "enable developer checks, treats warnings as errors" OFF) +option(UR_DEVELOPER_MODE "treats warnings as errors" OFF) option(UR_ENABLE_FAST_SPEC_MODE "enable fast specification generation mode" OFF) option(UR_USE_ASAN "enable AddressSanitizer" OFF) option(UR_USE_UBSAN "enable UndefinedBehaviorSanitizer" OFF) @@ -37,7 +42,8 @@ option(UR_USE_MSAN "enable MemorySanitizer" OFF) option(UR_USE_TSAN "enable ThreadSanitizer" OFF) option(UR_ENABLE_TRACING "enable api tracing through xpti" OFF) option(UR_ENABLE_SANITIZER "enable device sanitizer" ON) -option(UMF_BUILD_SHARED_LIBRARY "Build UMF as shared library" OFF) +option(UR_ENABLE_SYMBOLIZER "enable symbolizer for sanitizer" OFF) +option(UMF_BUILD_SHARED_LIBRARY "Build UMF as shared library" ON) option(UMF_ENABLE_POOL_TRACKING "Build UMF with pool tracking" ON) option(UR_BUILD_ADAPTER_L0 "Build the Level-Zero adapter" OFF) option(UR_BUILD_ADAPTER_OPENCL "Build the OpenCL adapter" OFF) @@ -45,11 +51,15 @@ option(UR_BUILD_ADAPTER_CUDA "Build the CUDA adapter" OFF) option(UR_BUILD_ADAPTER_HIP "Build the HIP adapter" OFF) option(UR_BUILD_ADAPTER_NATIVE_CPU "Build the Native-CPU adapter" OFF) option(UR_BUILD_ADAPTER_ALL "Build all currently supported adapters" OFF) +option(UR_BUILD_ADAPTER_L0_V2 "Build the (experimental) Level-Zero v2 adapter" OFF) +option(UR_STATIC_ADAPTER_L0 "Build the Level-Zero adapter as static and embed in the loader" OFF) option(UR_BUILD_EXAMPLE_CODEGEN "Build the codegen example."
OFF) option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF) option(UR_ENABLE_ASSERTIONS "Enable assertions for all build types" OFF) option(UR_BUILD_XPTI_LIBS "Build the XPTI libraries when tracing is enabled" ON) option(UR_STATIC_LOADER "Build loader as a static library" OFF) +option(UR_FORCE_LIBSTDCXX "Force use of libstdc++ in a build using libc++ on Linux" OFF) +option(UR_ENABLE_LATENCY_HISTOGRAM "Enable latency histogram" OFF) set(UR_DPCXX "" CACHE FILEPATH "Path of the DPC++ compiler executable") set(UR_DPCXX_BUILD_FLAGS "" CACHE STRING "Build flags to pass to DPC++ when compiling device programs") set(UR_SYCL_LIBRARY_DIR "" CACHE PATH @@ -58,6 +68,7 @@ set(UR_CONFORMANCE_TARGET_TRIPLES "" CACHE STRING "List of sycl targets to build CTS device binaries for") set(UR_CONFORMANCE_AMD_ARCH "" CACHE STRING "AMD device target ID to build CTS binaries for") option(UR_CONFORMANCE_ENABLE_MATCH_FILES "Enable CTS match files" ON) +option(UR_CONFORMANCE_TEST_LOADER "Also test the loader in the conformance tests" OFF) set(UR_ADAPTER_LEVEL_ZERO_SOURCE_DIR "" CACHE PATH "Path to external 'level_zero' adapter source dir") set(UR_ADAPTER_OPENCL_SOURCE_DIR "" CACHE PATH @@ -88,6 +99,21 @@ if(CMAKE_SYSTEM_NAME STREQUAL Windows AND NOT CMAKE_GENERATOR STREQUAL Ninja) set(CUSTOM_COMMAND_BINARY_DIR ${CUSTOM_COMMAND_BINARY_DIR}/$) endif() +if(UR_FORCE_LIBSTDCXX AND CMAKE_SYSTEM_NAME STREQUAL Linux) + # Remove flags to specify using libc++ or static libstdc++ in order to + # support situations where the libstdc++ ABI is required. 
+ foreach(flags CMAKE_CXX_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) + string(REPLACE "-stdlib=libc++" "" ${flags} "${${flags}}") + string(REPLACE "-static-libstdc++" "" ${flags} "${${flags}}") + endforeach() + # Globally link against pthread, this is necessary when forcing use of + # libstdc++ in a libc++ build as the FindThreads module may have already + # been invoked and detected that pthread symbols are provided by libc++ + # which is not the case for libstdc++. + add_compile_options(-pthread) + link_libraries(pthread) +endif() + if(NOT MSVC) # Determine if libstdc++ is being used. check_cxx_source_compiles(" @@ -97,7 +123,7 @@ if(NOT MSVC) #endif int main() {}" USING_LIBSTDCXX) - if(USING_LIBSTDCXX) + if(UR_FORCE_LIBSTDCXX OR USING_LIBSTDCXX) # Support older versions of GCC where the header is not # available and must be used instead. This # requires linking against libstdc++fs.a, on systems where @@ -136,6 +162,12 @@ if(UR_ENABLE_TRACING) LIBRARY_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY} ) + if (NOT MSVC) + # Hardening flags cause issues on Windows + add_ur_target_compile_options(xptifw) + add_ur_target_link_options(xptifw) + endif() + if (UR_STATIC_LOADER) install(TARGETS xpti xptifw EXPORT ${PROJECT_NAME}-targets @@ -163,6 +195,14 @@ if(UR_ENABLE_SANITIZER) else() add_compile_definitions(UR_ENABLE_SANITIZER) endif() + + if(UR_ENABLE_SYMBOLIZER AND UR_STANDALONE_BUILD) + find_package(LLVM REQUIRED) + endif() +else() + if(UR_ENABLE_SYMBOLIZER) + message(FATAL_ERROR "Symbolizer must be enabled with Sanitizer layer") + endif() endif() if(UR_USE_ASAN) @@ -185,6 +225,14 @@ if(UR_USE_MSAN) add_sanitizer_flag(memory) endif() +if(NOT (UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_HIP + OR UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_OPENCL + OR UR_BUILD_ADAPTER_NATIVE_CPU OR UR_BUILD_ADAPTER_L0_V2 + OR UR_BUILD_ADAPTER_ALL)) + message(WARNING "No adapters have been enabled; conformance tests will not be run") + message(STATUS "Consider setting 
UR_BUILD_ADAPTER_*") +endif() + # Check if clang-format (in correct version) is available for Cpp code formatting. if(UR_FORMAT_CPP_STYLE) find_program(CLANG_FORMAT NAMES clang-format-15 clang-format-15.0 clang-format) @@ -236,6 +284,13 @@ add_custom_target(verify-licenses COMMENT "Verify all files contain a license." ) +# Add hardening check +add_custom_target(verify-hardening + COMMAND "${PROJECT_SOURCE_DIR}/scripts/check-hardening.sh" + ${CMAKE_BINARY_DIR} + COMMENT "Check hardening settings on built binaries and libraries" +) + # Add code formatter target add_custom_target(cppformat) # ... and all source files to the formatter diff --git a/README.md b/README.md index ae61b76b09..262a861b9d 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ # Unified Runtime [![Build and test](https://github.com/oneapi-src/unified-runtime/actions/workflows/cmake.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/cmake.yml) -[![E2E Cuda](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_cuda.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_cuda.yml) -[![E2E OpenCL](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_opencl.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_opencl.yml) -[![E2E Level Zero](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_level_zero.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_level_zero.yml) -[![CodeQL](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml) [![Bandit](https://github.com/oneapi-src/unified-runtime/actions/workflows/bandit.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/bandit.yml) 
+[![CodeQL](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml) [![Coverity](https://scan.coverity.com/projects/28213/badge.svg)](https://scan.coverity.com/projects/oneapi-src-unified-runtime) +[![Nightly](https://github.com/oneapi-src/unified-runtime/actions/workflows/nightly.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/nightly.yml) [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/oneapi-src/unified-runtime/badge)](https://securityscorecards.dev/viewer/?uri=github.com/oneapi-src/unified-runtime) +[![Trivy](https://github.com/oneapi-src/unified-runtime/actions/workflows/trivy.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/trivy.yml) +[![Deploy documentation to Pages](https://github.com/oneapi-src/unified-runtime/actions/workflows/docs.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/docs.yml) @@ -122,7 +122,7 @@ List of options provided by CMake: | UR_BUILD_TESTS | Build the tests | ON/OFF | ON | | UR_BUILD_TOOLS | Build tools | ON/OFF | ON | | UR_FORMAT_CPP_STYLE | Format code style | ON/OFF | OFF | -| UR_DEVELOPER_MODE | Treat warnings as errors and enables additional checks | ON/OFF | OFF | +| UR_DEVELOPER_MODE | Treat warnings as errors | ON/OFF | OFF | | UR_ENABLE_FAST_SPEC_MODE | Enable fast specification generation mode | ON/OFF | OFF | | UR_USE_ASAN | Enable AddressSanitizer | ON/OFF | OFF | | UR_USE_TSAN | Enable ThreadSanitizer | ON/OFF | OFF | @@ -133,25 +133,35 @@ List of options provided by CMake: | UR_CONFORMANCE_TARGET_TRIPLES | SYCL triples to build CTS device binaries for | Comma-separated list | spir64 | | UR_CONFORMANCE_AMD_ARCH | AMD device target ID to build CTS binaries for | string | `""` | | UR_CONFORMANCE_ENABLE_MATCH_FILES | Enable CTS match files | ON/OFF | ON | +| UR_CONFORMANCE_TEST_LOADER | 
Additionally build and run "loader" tests for the CTS | ON/OFF | OFF | | UR_BUILD_ADAPTER_L0 | Build the Level-Zero adapter | ON/OFF | OFF | | UR_BUILD_ADAPTER_OPENCL | Build the OpenCL adapter | ON/OFF | OFF | | UR_BUILD_ADAPTER_CUDA | Build the CUDA adapter | ON/OFF | OFF | | UR_BUILD_ADAPTER_HIP | Build the HIP adapter | ON/OFF | OFF | | UR_BUILD_ADAPTER_NATIVE_CPU | Build the Native-CPU adapter | ON/OFF | OFF | | UR_BUILD_ADAPTER_ALL | Build all currently supported adapters | ON/OFF | OFF | +| UR_BUILD_ADAPTER_L0_V2 | Build the (experimental) Level-Zero v2 adapter | ON/OFF | OFF | +| UR_STATIC_ADAPTER_L0 | Build the Level-Zero adapter as static and embed in the loader | ON/OFF | OFF | | UR_HIP_PLATFORM | Build HIP adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD | | UR_ENABLE_COMGR | Enable comgr lib usage | AMD/NVIDIA | AMD | | UR_DPCXX | Path of the DPC++ compiler executable to build CTS device binaries | File path | `""` | | UR_DEVICE_CODE_EXTRACTOR | Path of the `clang-offload-extract` executable from the DPC++ package, required for CTS device binaries | File path | `"${dirname(UR_DPCXX)}/clang-offload-extract"` | | UR_DPCXX_BUILD_FLAGS | Build flags to pass to DPC++ when compiling device programs | Space-separated options list | `""` | | UR_SYCL_LIBRARY_DIR | Path of the SYCL runtime library directory to build CTS device binaries | Directory path | `""` | -| UR_HIP_ROCM_DIR | Path of the default ROCm HIP installation | Directory path | `/opt/rocm` | +| UR_HIP_ROCM_DIR | Path of the default ROCm HIP installation | Directory path | `$ENV{ROCM_PATH}` or `/opt/rocm` | | UR_HIP_INCLUDE_DIR | Path of the ROCm HIP include directory | Directory path | `${UR_HIP_ROCM_DIR}/include` | | UR_HIP_HSA_INCLUDE_DIRS | Path of the ROCm HSA include directory | Directory path | `${UR_HIP_ROCM_DIR}/hsa/include;${UR_HIP_ROCM_DIR}/include` | | UR_HIP_LIB_DIR | Path of the ROCm HIP library directory | Directory path | `${UR_HIP_ROCM_DIR}/lib` | ### Additional make targets 
+To run tests, do the following: + +```bash +$ make +$ make test +``` + To run automated code formatting, configure CMake with `UR_FORMAT_CPP_STYLE` option and then run a custom `cppformat` target: diff --git a/cmake/FetchLevelZero.cmake b/cmake/FetchLevelZero.cmake new file mode 100644 index 0000000000..9e3c3f704f --- /dev/null +++ b/cmake/FetchLevelZero.cmake @@ -0,0 +1,94 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +set(UR_LEVEL_ZERO_LOADER_LIBRARY "" CACHE FILEPATH "Path of the Level Zero Loader library") +set(UR_LEVEL_ZERO_INCLUDE_DIR "" CACHE FILEPATH "Directory containing the Level Zero Headers") +set(UR_LEVEL_ZERO_LOADER_REPO "" CACHE STRING "Github repo to get the Level Zero loader sources from") +set(UR_LEVEL_ZERO_LOADER_TAG "" CACHE STRING "GIT tag of the Level Zero Loader taken from github repo") + +# Copy Level Zero loader/headers locally to the build to avoid leaking their path. 
+set(LEVEL_ZERO_COPY_DIR ${CMAKE_CURRENT_BINARY_DIR}/level_zero_loader) +if (NOT UR_LEVEL_ZERO_LOADER_LIBRARY STREQUAL "") + get_filename_component(LEVEL_ZERO_LIB_NAME "${UR_LEVEL_ZERO_LOADER_LIBRARY}" NAME) + set(LEVEL_ZERO_LIBRARY ${LEVEL_ZERO_COPY_DIR}/${LEVEL_ZERO_LIB_NAME}) + message(STATUS "Level Zero Adapter: Copying Level Zero loader to local build tree") + file(COPY ${UR_LEVEL_ZERO_LOADER_LIBRARY} DESTINATION ${LEVEL_ZERO_COPY_DIR} FOLLOW_SYMLINK_CHAIN) +endif() +if (NOT UR_LEVEL_ZERO_INCLUDE_DIR STREQUAL "") + set(LEVEL_ZERO_INCLUDE_DIR ${LEVEL_ZERO_COPY_DIR}) + message(STATUS "Level Zero Adapter: Copying Level Zero headers to local build tree") + file(COPY ${UR_LEVEL_ZERO_INCLUDE_DIR}/ DESTINATION ${LEVEL_ZERO_COPY_DIR}) +endif() + +if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR) + message(STATUS "Level Zero Adapter: Download Level Zero loader and headers from github.com") + + # Workaround warnings/errors for Level Zero build + set(CMAKE_CXX_FLAGS_BAK "${CMAKE_CXX_FLAGS}") + if (UNIX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-but-set-variable") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-pedantic") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-stringop-truncation") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-c++98-compat-extra-semi") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-warning-option") + endif() + + if (UR_LEVEL_ZERO_LOADER_REPO STREQUAL "") + set(UR_LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git") + endif() + if (UR_LEVEL_ZERO_LOADER_TAG STREQUAL "") + set(UR_LEVEL_ZERO_LOADER_TAG v1.18.3) + endif() + + # Disable due to a bug https://github.com/oneapi-src/level-zero/issues/104 + set(CMAKE_INCLUDE_CURRENT_DIR OFF) + # Prevent L0 loader from exporting extra symbols + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF) + + message(STATUS "Level Zero Adapter: Will fetch Level Zero Loader from 
${UR_LEVEL_ZERO_LOADER_REPO}") + include(FetchContent) + FetchContent_Declare(level-zero-loader + GIT_REPOSITORY ${UR_LEVEL_ZERO_LOADER_REPO} + GIT_TAG ${UR_LEVEL_ZERO_LOADER_TAG} + ) + if(MSVC) + set(USE_Z7 ON) + endif() + FetchContent_MakeAvailable(level-zero-loader) + FetchContent_GetProperties(level-zero-loader) + + # Restore original flags + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_BAK}") + + target_compile_options(ze_loader PRIVATE + $<$,GNU;Clang;Intel;IntelLLVM>:-Wno-error> + $<$:/WX- /UUNICODE> + ) + + set(LEVEL_ZERO_LIBRARY ze_loader) + set(LEVEL_ZERO_INCLUDE_DIR + ${level-zero-loader_SOURCE_DIR}/include CACHE PATH "Path to Level Zero Headers") +endif() + +add_library(LevelZeroLoader INTERFACE) +# The MSVC linker does not like / at the start of a path, so to work around this +# we split it into a link library and a library path, where the path is allowed +# to have leading /. +get_filename_component(LEVEL_ZERO_LIBRARY_SRC "${LEVEL_ZERO_LIBRARY}" DIRECTORY) +get_filename_component(LEVEL_ZERO_LIB_NAME "${LEVEL_ZERO_LIBRARY}" NAME) +target_link_directories(LevelZeroLoader + INTERFACE "$" + "$" +) +target_link_libraries(LevelZeroLoader + INTERFACE "${LEVEL_ZERO_LIB_NAME}" +) + +add_library(LevelZeroLoader-Headers INTERFACE) +target_include_directories(LevelZeroLoader-Headers + INTERFACE "$" + "$" +) diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index 24cb6f8e54..8b40da7d4c 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -1,4 +1,4 @@ -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
# See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -28,13 +28,14 @@ function(add_cppformat name) if(${ARGC} EQUAL 0) return() else() + # Split args into 2 parts (in Windows the list is probably too long) + list(SUBLIST ARGN 0 250 selected_files_1) + list(SUBLIST ARGN 251 -1 selected_files_2) add_custom_target(cppformat-${name} - COMMAND ${CLANG_FORMAT} - --style=file - --i - ${ARGN} + COMMAND ${CLANG_FORMAT} --style=file --i ${selected_files_1} + COMMAND ${CLANG_FORMAT} --style=file --i ${selected_files_2} COMMENT "Format CXX source files" - ) + ) endif() add_dependencies(cppformat cppformat-${name}) @@ -57,26 +58,42 @@ macro(add_sanitizer_flag flag) set(CMAKE_REQUIRED_LIBRARIES ${SAVED_CMAKE_REQUIRED_LIBRARIES}) endmacro() +check_cxx_compiler_flag("-fcf-protection=full" CXX_HAS_FCF_PROTECTION_FULL) + function(add_ur_target_compile_options name) if(NOT MSVC) + target_compile_definitions(${name} PRIVATE -D_FORTIFY_SOURCE=2) target_compile_options(${name} PRIVATE - -fPIC + # Warning options -Wall -Wpedantic -Wempty-body + -Wformat + -Wformat-security -Wunused-parameter + + # Hardening options + -fPIC + -fstack-protector-strong + -fvisibility=hidden # Required for -fsanitize=cfi + # -fsanitize=cfi requires -flto, which breaks a lot of things + # See: https://github.com/oneapi-src/unified-runtime/issues/2120 + # -flto + # $<$:-fsanitize=cfi> + $<$:-fcf-protection=full> + # -fstack-clash-protection is not supported in apple clang or GCC < 8 + $<$,$,8>>:-fstack-clash-protection> + $<$:-fstack-clash-protection> + + # Colored output $<$:-fdiagnostics-color=always> $<$:-fcolor-diagnostics> ) - if (CMAKE_BUILD_TYPE STREQUAL "Release") - target_compile_definitions(${name} PRIVATE -D_FORTIFY_SOURCE=2) + if (UR_DEVELOPER_MODE) + target_compile_options(${name} PRIVATE -Werror -Wextra) endif() - if(UR_DEVELOPER_MODE) - target_compile_options(${name} PRIVATE - -Werror - -fno-omit-frame-pointer - -fstack-protector-strong - ) + if (CMAKE_BUILD_TYPE 
STREQUAL "Release") + target_compile_options(${name} PRIVATE -fvisibility=hidden) endif() elseif(MSVC) target_compile_options(${name} PRIVATE @@ -101,7 +118,15 @@ endfunction() function(add_ur_target_link_options name) if(NOT MSVC) if (NOT APPLE) - target_link_options(${name} PRIVATE "LINKER:-z,relro,-z,now") + target_link_options(${name} PRIVATE "LINKER:-z,relro,-z,now,-z,noexecstack") + if (UR_DEVELOPER_MODE) + target_link_options(${name} PRIVATE -Werror -Wextra) + endif() + if (CMAKE_BUILD_TYPE STREQUAL "Release") + target_link_options(${name} PRIVATE + $<$:-pie> + ) + endif() endif() elseif(MSVC) target_link_options(${name} PRIVATE @@ -138,6 +163,15 @@ function(add_ur_library name) endif() endfunction() +function(install_ur_library name) + install(TARGETS ${name} + EXPORT ${PROJECT_NAME}-targets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT unified-runtime + ) +endfunction() + include(FetchContent) function(FetchSource GIT_REPOSITORY GIT_TAG GIT_DIR DEST) diff --git a/cmake/match.py b/cmake/match.py index 5b96d3008f..8075ab201c 100755 --- a/cmake/match.py +++ b/cmake/match.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright (C) 2023 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. @@ -12,6 +12,8 @@ # List of available special tags: # {{OPT}} - makes content in the same line as the tag optional # {{IGNORE}} - ignores all content until the next successfully matched line or the end of the input +# {{NONDETERMINISTIC}} - order of match rules isn't important - each (non OPT) input line is paired with a match line +# in any order # Special tags are mutually exclusive and are expected to be located at the start of a line. 
# @@ -20,15 +22,25 @@ import re from enum import Enum +## @brief print a sequence of lines +def print_lines(lines, hint = None): + counter = 1 + for l in lines: + hint_char = " " + if hint == counter - 1: + hint_char = ">" + print("{}{:4d}| {}".format(hint_char, counter, l.strip())) + counter += 1 + ## @brief print the whole content of input and match files -def print_content(input_lines, match_lines, ignored_lines): - print("--- Input Lines " + "-" * 64) - print("".join(input_lines).strip()) - print("--- Match Lines " + "-" * 64) - print("".join(match_lines).strip()) - print("--- Ignored Lines " + "-" * 62) - print("".join(ignored_lines).strip()) +def print_content(input_lines, match_lines, ignored_lines, hint_input = None, hint_match = None): + print("------ Input Lines " + "-" * 61) + print_lines(input_lines, hint_input) + print("------ Match Lines " + "-" * 61) + print_lines(match_lines, hint_match) + print("------ Ignored Lines " + "-" * 59) + print_lines(ignored_lines) print("-" * 80) @@ -39,6 +51,24 @@ def print_incorrect_match(match_line, present, expected): print("expected: " + expected) +## @brief print missing match line +def print_input_not_found(input_line, input): + print("Input line " + str(input_line) + " has no match line") + print("is: " + input) + + +## @brief print missing input line +def print_match_not_found(match_line, input): + print("Match line " + str(match_line) + " has no input line") + print("is: " + input) + + +## @brief print general syntax error +def print_error(text, match_line): + print("Line " + str(match_line) + " encountered an error") + print(text) + + ## @brief pattern matching script status values class Status(Enum): INPUT_END = 1 @@ -63,6 +93,7 @@ def check_status(input_lines, match_lines): class Tag(Enum): OPT = "{{OPT}}" # makes the line optional IGNORE = "{{IGNORE}}" # ignores all input until next match or end of input file + NONDETERMINISTIC = "{{NONDETERMINISTIC}}" # switches on "deterministic mode" COMMENT = "#" # 
comment - line ignored @@ -88,32 +119,53 @@ def main(): ) ignored_lines = [] + matched_lines = set() input_idx = 0 match_idx = 0 tags_in_effect = [] + deterministic_mode = False while True: # check file status status = check_status(input_lines[input_idx:], match_lines[match_idx:]) - if (status == Status.INPUT_AND_MATCH_END) or (status == Status.MATCH_END and Tag.IGNORE in tags_in_effect): - # all lines matched or the last line in match file is an ignore tag - sys.exit(0) - elif status == Status.MATCH_END: - print_incorrect_match(match_idx + 1, input_lines[input_idx].strip(), ""); - print_content(input_lines, match_lines, ignored_lines) - sys.exit(1) - elif status == Status.INPUT_END: - # If we get to the end of the input, but still have pending matches, - # then that's a failure unless all pending matches are optional - - # otherwise we're done - while match_idx < len(match_lines): - if not (match_lines[match_idx].startswith(Tag.OPT.value) or - match_lines[match_idx].startswith(Tag.IGNORE.value)): - print_incorrect_match(match_idx + 1, "", match_lines[match_idx]); - print_content(input_lines, match_lines, ignored_lines) + if deterministic_mode: + if status == Status.INPUT_END: + # Convert the list of seen matches to the list of unseen matches + remaining_matches = set(range(len(match_lines))) - matched_lines + for m in remaining_matches: + line = match_lines[m] + if line.startswith(Tag.OPT.value) or line.startswith(Tag.NONDETERMINISTIC.value): + continue + print_match_not_found(m + 1, match_lines[m]) + print_content(input_lines, match_lines, ignored_lines, hint_match=m) sys.exit(1) - match_idx += 1 - sys.exit(0) + + sys.exit(0) + elif status == Status.MATCH_END: + print_input_not_found(input_idx + 1, input_lines[input_idx]) + print_content(input_lines, match_lines, ignored_lines, hint_input=input_idx) + sys.exit(1) + else: + if (status == Status.INPUT_AND_MATCH_END) or (status == Status.MATCH_END and Tag.IGNORE in tags_in_effect): + # all lines matched or the last 
line in match file is an ignore tag + sys.exit(0) + elif status == Status.MATCH_END: + print_incorrect_match(input_idx + 1, input_lines[input_idx].strip(), "") + print_content(input_lines, match_lines, ignored_lines, hint_input=input_idx) + sys.exit(1) + elif status == Status.INPUT_END: + # If we get to the end of the input, but still have pending matches, + # then that's a failure unless all pending matches are optional - + # otherwise we're done + while match_idx < len(match_lines): + if not (match_lines[match_idx].startswith(Tag.OPT.value) or + match_lines[match_idx].startswith(Tag.IGNORE.value) or + match_lines[match_idx].startswith(Tag.NONDETERMINISTIC.value)): + print_incorrect_match(match_idx + 1, "", match_lines[match_idx]) + print_content(input_lines, match_lines, ignored_lines, hint_match=match_idx) + sys.exit(1) + match_idx += 1 + sys.exit(0) input_line = input_lines[input_idx].strip() if input_idx < len(input_lines) else "" match_line = match_lines[match_idx] @@ -122,7 +174,15 @@ def main(): if match_line.startswith(Tag.OPT.value): tags_in_effect.append(Tag.OPT) match_line = match_line[len(Tag.OPT.value):] + elif match_line.startswith(Tag.NONDETERMINISTIC.value) and not deterministic_mode: + deterministic_mode = True + match_idx = 0 + input_idx = 0 + continue elif match_line.startswith(Tag.IGNORE.value): + if deterministic_mode: + print_error(r"Can't use \{{IGNORE\}} in deterministic mode") + sys.exit(2) tags_in_effect.append(Tag.IGNORE) match_idx += 1 continue # line with ignore tag should be skipped @@ -137,20 +197,29 @@ def main(): pattern += part # match or process tags - if re.fullmatch(pattern, input_line): - input_idx += 1 - match_idx += 1 - tags_in_effect = [] - elif Tag.OPT in tags_in_effect: - match_idx += 1 - tags_in_effect.remove(Tag.OPT) - elif Tag.IGNORE in tags_in_effect: - ignored_lines.append(input_line + os.linesep) - input_idx += 1 + if deterministic_mode: + if re.fullmatch(pattern, input_line) and match_idx not in matched_lines: + 
input_idx += 1 + matched_lines.add(match_idx) + match_idx = 0 + tags_in_effect = [] + else: + match_idx += 1 else: - print_incorrect_match(match_idx + 1, input_line, match_line.strip()) - print_content(input_lines, match_lines, ignored_lines) - sys.exit(1) + if re.fullmatch(pattern, input_line): + input_idx += 1 + match_idx += 1 + tags_in_effect = [] + elif Tag.OPT in tags_in_effect: + match_idx += 1 + tags_in_effect.remove(Tag.OPT) + elif Tag.IGNORE in tags_in_effect: + ignored_lines.append(input_line + os.linesep) + input_idx += 1 + else: + print_incorrect_match(match_idx + 1, input_line, match_line.strip()) + print_content(input_lines, match_lines, ignored_lines, hint_match=match_idx, hint_input=input_idx) + sys.exit(1) if __name__ == "__main__": diff --git a/examples/collector/CMakeLists.txt b/examples/collector/CMakeLists.txt index 5fe484d0b8..6dd112aae0 100644 --- a/examples/collector/CMakeLists.txt +++ b/examples/collector/CMakeLists.txt @@ -17,6 +17,6 @@ target_link_libraries(${TARGET_NAME} PRIVATE ${TARGET_XPTI}) target_include_directories(${TARGET_NAME} PRIVATE ${xpti_SOURCE_DIR}/include) if(MSVC) - target_compile_definitions(${TARGET_NAME} PRIVATE - XPTI_STATIC_LIBRARY XPTI_CALLBACK_API_EXPORTS) + target_compile_definitions(${TARGET_NAME} PRIVATE XPTI_STATIC_LIBRARY) endif() +target_compile_definitions(${TARGET_NAME} PRIVATE XPTI_CALLBACK_API_EXPORTS) diff --git a/examples/collector/collector.cpp b/examples/collector/collector.cpp index 910964e02c..cc9580bc4f 100644 --- a/examples/collector/collector.cpp +++ b/examples/collector/collector.cpp @@ -31,7 +31,7 @@ constexpr uint16_t TRACE_FN_BEGIN = static_cast(xpti::trace_point_type_t::function_with_args_begin); constexpr uint16_t TRACE_FN_END = static_cast(xpti::trace_point_type_t::function_with_args_end); -constexpr std::string_view UR_STREAM_NAME = "ur"; +constexpr std::string_view UR_STREAM_NAME = "ur.call"; /** * @brief Formats the function parameters and arguments for urAdapterGet diff --git 
a/include/ur_api.h b/include/ur_api.h index 412c460b64..376532c9f4 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -7,7 +7,7 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * * @file ur_api.h - * @version v0.10-r0 + * @version v0.11-r0 * */ #ifndef UR_API_H_INCLUDED @@ -150,7 +150,6 @@ typedef enum ur_function_t { UR_FUNCTION_COMMAND_BUFFER_RELEASE_EXP = 122, ///< Enumerator for ::urCommandBufferReleaseExp UR_FUNCTION_COMMAND_BUFFER_FINALIZE_EXP = 123, ///< Enumerator for ::urCommandBufferFinalizeExp UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_EXP = 125, ///< Enumerator for ::urCommandBufferAppendKernelLaunchExp - UR_FUNCTION_COMMAND_BUFFER_ENQUEUE_EXP = 128, ///< Enumerator for ::urCommandBufferEnqueueExp UR_FUNCTION_USM_PITCHED_ALLOC_EXP = 132, ///< Enumerator for ::urUSMPitchedAllocExp UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_HANDLE_DESTROY_EXP = 133, ///< Enumerator for ::urBindlessImagesUnsampledImageHandleDestroyExp UR_FUNCTION_BINDLESS_IMAGES_SAMPLED_IMAGE_HANDLE_DESTROY_EXP = 134, ///< Enumerator for ::urBindlessImagesSampledImageHandleDestroyExp @@ -163,7 +162,6 @@ typedef enum ur_function_t { UR_FUNCTION_BINDLESS_IMAGES_MIPMAP_GET_LEVEL_EXP = 141, ///< Enumerator for ::urBindlessImagesMipmapGetLevelExp UR_FUNCTION_BINDLESS_IMAGES_MIPMAP_FREE_EXP = 142, ///< Enumerator for ::urBindlessImagesMipmapFreeExp UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_ARRAY_EXP = 144, ///< Enumerator for ::urBindlessImagesMapExternalArrayExp - UR_FUNCTION_BINDLESS_IMAGES_RELEASE_INTEROP_EXP = 145, ///< Enumerator for ::urBindlessImagesReleaseInteropExp UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_SEMAPHORE_EXP = 147, ///< Enumerator for ::urBindlessImagesReleaseExternalSemaphoreExp UR_FUNCTION_BINDLESS_IMAGES_WAIT_EXTERNAL_SEMAPHORE_EXP = 148, ///< Enumerator for ::urBindlessImagesWaitExternalSemaphoreExp UR_FUNCTION_BINDLESS_IMAGES_SIGNAL_EXTERNAL_SEMAPHORE_EXP = 149, ///< Enumerator for ::urBindlessImagesSignalExternalSemaphoreExp @@ -200,17 +198,6 
@@ typedef enum ur_function_t { UR_FUNCTION_LOADER_CONFIG_SET_CODE_LOCATION_CALLBACK = 200, ///< Enumerator for ::urLoaderConfigSetCodeLocationCallback UR_FUNCTION_LOADER_INIT = 201, ///< Enumerator for ::urLoaderInit UR_FUNCTION_LOADER_TEAR_DOWN = 202, ///< Enumerator for ::urLoaderTearDown - UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP = 203, ///< Enumerator for ::urCommandBufferAppendUSMMemcpyExp - UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP = 204, ///< Enumerator for ::urCommandBufferAppendUSMFillExp - UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP = 205, ///< Enumerator for ::urCommandBufferAppendMemBufferCopyExp - UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP = 206, ///< Enumerator for ::urCommandBufferAppendMemBufferWriteExp - UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_EXP = 207, ///< Enumerator for ::urCommandBufferAppendMemBufferReadExp - UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_RECT_EXP = 208, ///< Enumerator for ::urCommandBufferAppendMemBufferCopyRectExp - UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_RECT_EXP = 209, ///< Enumerator for ::urCommandBufferAppendMemBufferWriteRectExp - UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_RECT_EXP = 210, ///< Enumerator for ::urCommandBufferAppendMemBufferReadRectExp - UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_FILL_EXP = 211, ///< Enumerator for ::urCommandBufferAppendMemBufferFillExp - UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_PREFETCH_EXP = 212, ///< Enumerator for ::urCommandBufferAppendUSMPrefetchExp - UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_ADVISE_EXP = 213, ///< Enumerator for ::urCommandBufferAppendUSMAdviseExp UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP = 214, ///< Enumerator for ::urEnqueueCooperativeKernelLaunchExp UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP = 215, ///< Enumerator for ::urKernelSuggestMaxCooperativeGroupCountExp UR_FUNCTION_PROGRAM_GET_GLOBAL_VARIABLE_POINTER = 216, ///< Enumerator for ::urProgramGetGlobalVariablePointer 
@@ -227,8 +214,24 @@ typedef enum ur_function_t { UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_EXP = 227, ///< Enumerator for ::urBindlessImagesImportExternalSemaphoreExp UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP = 228, ///< Enumerator for ::urEnqueueNativeCommandExp UR_FUNCTION_LOADER_CONFIG_SET_MOCKING_ENABLED = 229, ///< Enumerator for ::urLoaderConfigSetMockingEnabled - UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP = 230, ///< Enumerator for ::urTensorMapEncodeIm2ColExp - UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP = 231, ///< Enumerator for ::urTensorMapEncodeTiledExp + UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP = 230, ///< Enumerator for ::urBindlessImagesReleaseExternalMemoryExp + UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP = 231, ///< Enumerator for ::urCommandBufferAppendUSMMemcpyExp + UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP = 232, ///< Enumerator for ::urCommandBufferAppendUSMFillExp + UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP = 233, ///< Enumerator for ::urCommandBufferAppendMemBufferCopyExp + UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP = 234, ///< Enumerator for ::urCommandBufferAppendMemBufferWriteExp + UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_EXP = 235, ///< Enumerator for ::urCommandBufferAppendMemBufferReadExp + UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_RECT_EXP = 236, ///< Enumerator for ::urCommandBufferAppendMemBufferCopyRectExp + UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_RECT_EXP = 237, ///< Enumerator for ::urCommandBufferAppendMemBufferWriteRectExp + UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_RECT_EXP = 238, ///< Enumerator for ::urCommandBufferAppendMemBufferReadRectExp + UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_FILL_EXP = 239, ///< Enumerator for ::urCommandBufferAppendMemBufferFillExp + UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_PREFETCH_EXP = 240, ///< Enumerator for ::urCommandBufferAppendUSMPrefetchExp + UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_ADVISE_EXP = 241, 
///< Enumerator for ::urCommandBufferAppendUSMAdviseExp + UR_FUNCTION_COMMAND_BUFFER_ENQUEUE_EXP = 242, ///< Enumerator for ::urCommandBufferEnqueueExp + UR_FUNCTION_COMMAND_BUFFER_UPDATE_SIGNAL_EVENT_EXP = 243, ///< Enumerator for ::urCommandBufferUpdateSignalEventExp + UR_FUNCTION_COMMAND_BUFFER_UPDATE_WAIT_EVENTS_EXP = 244, ///< Enumerator for ::urCommandBufferUpdateWaitEventsExp + UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP = 245, ///< Enumerator for ::urBindlessImagesMapExternalLinearMemoryExp + UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP = 246, ///< Enumerator for ::urTensorMapEncodeIm2ColExp + UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP = 247, ///< Enumerator for ::urTensorMapEncodeTiledExp /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -279,8 +282,8 @@ typedef enum ur_structure_type_t { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC = 0x1003, ///< ::ur_exp_command_buffer_update_pointer_arg_desc_t UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC = 0x1004, ///< ::ur_exp_command_buffer_update_value_arg_desc_t UR_STRUCTURE_TYPE_EXP_SAMPLER_MIP_PROPERTIES = 0x2000, ///< ::ur_exp_sampler_mip_properties_t - UR_STRUCTURE_TYPE_EXP_INTEROP_MEM_DESC = 0x2001, ///< ::ur_exp_interop_mem_desc_t - UR_STRUCTURE_TYPE_EXP_INTEROP_SEMAPHORE_DESC = 0x2002, ///< ::ur_exp_interop_semaphore_desc_t + UR_STRUCTURE_TYPE_EXP_EXTERNAL_MEM_DESC = 0x2001, ///< ::ur_exp_external_mem_desc_t + UR_STRUCTURE_TYPE_EXP_EXTERNAL_SEMAPHORE_DESC = 0x2002, ///< ::ur_exp_external_semaphore_desc_t UR_STRUCTURE_TYPE_EXP_FILE_DESCRIPTOR = 0x2003, ///< ::ur_exp_file_descriptor_t UR_STRUCTURE_TYPE_EXP_WIN32_HANDLE = 0x2004, ///< ::ur_exp_win32_handle_t UR_STRUCTURE_TYPE_EXP_SAMPLER_ADDR_MODES = 0x2005, ///< ::ur_exp_sampler_addr_modes_t @@ -333,9 +336,17 @@ typedef enum ur_structure_type_t { #if defined(_WIN32) /// @brief Microsoft-specific dllexport storage-class attribute #define UR_APIEXPORT __declspec(dllexport) +#endif // defined(_WIN32) +#endif // 
UR_APIEXPORT + +/////////////////////////////////////////////////////////////////////////////// +#ifndef UR_APIEXPORT +#if __GNUC__ >= 4 +/// @brief GCC-specific dllexport storage-class attribute +#define UR_APIEXPORT __attribute__((visibility("default"))) #else #define UR_APIEXPORT -#endif // defined(_WIN32) +#endif // __GNUC__ >= 4 #endif // UR_APIEXPORT /////////////////////////////////////////////////////////////////////////////// @@ -499,10 +510,11 @@ typedef enum ur_result_t { UR_RESULT_ERROR_INVALID_USM_SIZE = 64, ///< Invalid USM size UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE = 65, ///< Objection allocation failure UR_RESULT_ERROR_ADAPTER_SPECIFIC = 66, ///< An adapter specific warning/error has been reported and can be - ///< retrieved via the urPlatformGetLastError entry point. + ///< retrieved via the urAdapterGetLastError entry point. UR_RESULT_ERROR_LAYER_NOT_PRESENT = 67, ///< A requested layer was not found by the loader. UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS = 68, ///< An event in the provided wait list has ::UR_EVENT_STATUS_ERROR. UR_RESULT_ERROR_DEVICE_NOT_AVAILABLE = 69, ///< Device in question has `::UR_DEVICE_INFO_AVAILABLE == false` + UR_RESULT_ERROR_INVALID_SPEC_ID = 70, ///< A specialization constant identifier is not valid. UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP = 0x1000, ///< Invalid Command-Buffer UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP = 0x1001, ///< Sync point is not valid for the command-buffer UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP = 0x1002, ///< Sync point wait list is invalid @@ -700,7 +712,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urLoaderConfigEnableLayer( ur_loader_config_handle_t hLoaderConfig, ///< [in] Handle to config object the layer will be enabled for. const char *pLayerName ///< [in] Null terminated string containing the name of the layer to - ///< enable. + ///< enable. Empty if none are enabled. 
); /////////////////////////////////////////////////////////////////////////////// @@ -842,6 +854,7 @@ urLoaderTearDown( /// - ::UR_RESULT_ERROR_DEVICE_LOST /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_SIZE +/// + `NumEntries == 0 && phAdapters != NULL` UR_APIEXPORT ur_result_t UR_APICALL urAdapterGet( uint32_t NumEntries, ///< [in] the number of adapters to be added to phAdapters. @@ -850,7 +863,7 @@ urAdapterGet( ///< will be returned. ur_adapter_handle_t *phAdapters, ///< [out][optional][range(0, NumEntries)] array of handle of adapters. ///< If NumEntries is less than the number of adapters available, then - ///< ::urAdapterGet shall only retrieve that number of platforms. + ///< ::urAdapterGet shall only retrieve that number of adapters. uint32_t *pNumAdapters ///< [out][optional] returns the total number of adapters available. ); @@ -1134,7 +1147,8 @@ typedef enum ur_api_version_t { UR_API_VERSION_0_8 = UR_MAKE_VERSION(0, 8), ///< version 0.8 UR_API_VERSION_0_9 = UR_MAKE_VERSION(0, 9), ///< version 0.9 UR_API_VERSION_0_10 = UR_MAKE_VERSION(0, 10), ///< version 0.10 - UR_API_VERSION_CURRENT = UR_MAKE_VERSION(0, 10), ///< latest known version + UR_API_VERSION_0_11 = UR_MAKE_VERSION(0, 11), ///< version 0.11 + UR_API_VERSION_CURRENT = UR_MAKE_VERSION(0, 11), ///< latest known version /// @cond UR_API_VERSION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1289,10 +1303,8 @@ typedef enum ur_platform_backend_t { #endif /////////////////////////////////////////////////////////////////////////////// #ifndef UR_DEVICE_BINARY_TARGET_UNKNOWN -/// @brief Target identification strings for -/// ::ur_device_binary_t.pDeviceTargetSpec -/// A device type represented by a particular target triple requires -/// specific +/// @brief Target identification strings for ::ur_device_binary_t.pDeviceTargetSpec +/// A device type represented by a particular target triple requires specific /// binary images. 
We need to map the image type onto the device target triple #define UR_DEVICE_BINARY_TARGET_UNKNOWN "" #endif // UR_DEVICE_BINARY_TARGET_UNKNOWN @@ -1569,8 +1581,7 @@ typedef enum ur_device_info_t { ///< ::urDevicePartition UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS = 80, ///< [uint32_t] max number of sub groups UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS = 81, ///< [::ur_bool_t] support sub group independent forward progress - UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL = 82, ///< [uint32_t[]] return an array of sub group sizes supported on Intel - ///< device + UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL = 82, ///< [uint32_t[]] return an array of supported sub group sizes UR_DEVICE_INFO_USM_HOST_SUPPORT = 83, ///< [::ur_device_usm_access_capability_flags_t] support USM host memory ///< access UR_DEVICE_INFO_USM_DEVICE_SUPPORT = 84, ///< [::ur_device_usm_access_capability_flags_t] support USM device memory @@ -1630,8 +1641,10 @@ typedef enum ur_device_info_t { ///< `EnqueueDeviceGlobalVariableRead` entry points. UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP = 0x1000, ///< [::ur_bool_t] Returns true if the device supports the use of ///< command-buffers. - UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP = 0x1001, ///< [::ur_bool_t] Returns true if the device supports updating the kernel - ///< commands in a command-buffer. + UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP = 0x1001, ///< [::ur_device_command_buffer_update_capability_flags_t] Command-buffer + ///< update capabilities of the device + UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP = 0x1002, ///< [::ur_bool_t] Returns true if the device supports using event objects + ///< for command synchronization outside of a command-buffer. 
UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP = 0x1111, ///< [::ur_bool_t] return true if enqueue Cluster Launch is supported UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000, ///< [::ur_bool_t] returns true if the device supports the creation of ///< bindless images @@ -1657,9 +1670,9 @@ typedef enum ur_device_info_t { ///< device UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP = 0x200B, ///< [::ur_bool_t] returns true if the device supports using images created ///< from individual mipmap levels - UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP = 0x200C, ///< [::ur_bool_t] returns true if the device supports importing external + UR_DEVICE_INFO_EXTERNAL_MEMORY_IMPORT_SUPPORT_EXP = 0x200C, ///< [::ur_bool_t] returns true if the device supports importing external ///< memory resources - UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP = 0x200E, ///< [::ur_bool_t] returns true if the device supports importing external + UR_DEVICE_INFO_EXTERNAL_SEMAPHORE_IMPORT_SUPPORT_EXP = 0x200E, ///< [::ur_bool_t] returns true if the device supports importing external ///< semaphore resources UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP = 0x2010, ///< [::ur_bool_t] returns true if the device supports allocating and ///< accessing cubemap resources @@ -1816,8 +1829,7 @@ typedef enum ur_device_affinity_domain_flag_t { ///< ::UR_DEVICE_AFFINITY_DOMAIN_FLAG_NUMA, ///< ::UR_DEVICE_AFFINITY_DOMAIN_FLAG_L4_CACHE, ///< ::UR_DEVICE_AFFINITY_DOMAIN_FLAG_L3_CACHE, - ///< ::UR_DEVICE_AFFINITY_DOMAIN_FLAG_L2_CACHE, - ///< ::UR_DEVICE_AFFINITY_DOMAIN_FLAG_L1_CACHE, + ///< ::UR_DEVICE_AFFINITY_DOMAIN_FLAG_L2_CACHE, ::UR_DEVICE_AFFINITY_DOMAIN_FLAG_L1_CACHE, ///< and partition the device into sub devices comprised of compute units ///< that share memory subsystems at this level. 
/// @cond @@ -2063,7 +2075,7 @@ typedef struct ur_device_native_properties_t { /// - ::UR_RESULT_ERROR_DEVICE_LOST /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hPlatform` +/// + `NULL == hAdapter` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phDevice` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE @@ -2071,7 +2083,7 @@ typedef struct ur_device_native_properties_t { UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t *pProperties, ///< [in][optional] pointer to native device properties struct. ur_device_handle_t *phDevice ///< [out] pointer to the handle of the device object created. ); @@ -2602,8 +2614,7 @@ typedef struct ur_image_desc_t { /// /// @details /// - The primary ::ur_image_format_t that must be supported by all the -/// adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA, -/// UR_IMAGE_CHANNEL_TYPE_UNORM_INT8}, +/// adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT8}, /// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT16}, /// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT8}, /// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT16}, @@ -3798,7 +3809,7 @@ urUSMPoolGetInfo( #endif // Intel 'oneAPI' Unified Runtime APIs #if !defined(__GNUC__) -#pragma region virtual memory +#pragma region virtual_memory #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Virtual memory granularity info @@ -4483,7 +4494,9 @@ typedef enum ur_program_info_t { UR_PROGRAM_INFO_DEVICES = 3, ///< [::ur_device_handle_t[]] Return list of devices associated with a program. 
///< This is either the list of devices associated with the context or a ///< subset of those devices when the program is created using ::urProgramCreateWithBinary. - UR_PROGRAM_INFO_SOURCE = 4, ///< [char[]] Return program source associated with Program. + UR_PROGRAM_INFO_IL = 4, ///< [char[]] Return program IL if the program was created with + ///< ::urProgramCreateWithIL, otherwise return size will be set to 0 and + ///< nothing will be returned. UR_PROGRAM_INFO_BINARY_SIZES = 5, ///< [size_t[]] Return program binary sizes for each device. UR_PROGRAM_INFO_BINARIES = 6, ///< [unsigned char[]] Return program binaries for all devices for this ///< Program. @@ -4641,6 +4654,11 @@ typedef struct ur_specialization_constant_info_t { /// + `NULL == pSpecConstants` /// - ::UR_RESULT_ERROR_INVALID_SIZE /// + `count == 0` +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + A pSpecConstant entry contains a size that does not match that of the specialization constant in the module. +/// + A pSpecConstant entry contains a nullptr pValue. +/// - ::UR_RESULT_ERROR_INVALID_SPEC_ID +/// + Any id specified in a pSpecConstant entry is not a valid specialization constant identifier. UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( ur_program_handle_t hProgram, ///< [in] handle of the Program object @@ -4786,6 +4804,7 @@ urKernelSetArgValue( size_t argSize, ///< [in] size of argument type const ur_kernel_arg_value_properties_t *pProperties, ///< [in][optional] pointer to value properties. const void *pArgValue ///< [in] argument value represented as matching arg type. + ///< The data pointed to will be copied and therefore can be reused on return. 
); /////////////////////////////////////////////////////////////////////////////// @@ -4854,6 +4873,10 @@ typedef enum ur_kernel_group_info_t { UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE = 4, ///< [size_t] Return preferred multiple of Work Group size for launch UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE = 5, ///< [size_t] Return minimum amount of private memory in bytes used by each ///< work item in the Kernel + UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE = 6, ///< [size_t[3]] Return the maximum Work Group size guaranteed by the + ///< source code, or (0, 0, 0) if unspecified + UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE = 7, ///< [size_t] Return the maximum linearized Work Group size (X * Y * Z) + ///< guaranteed by the source code, or 0 if unspecified /// @cond UR_KERNEL_GROUP_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -4957,7 +4980,7 @@ urKernelGetInfo( /// + `NULL == hKernel` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE < propName` +/// + `::UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE < propName` UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( ur_kernel_handle_t hKernel, ///< [in] handle of the Kernel object @@ -5227,6 +5250,11 @@ urKernelSetArgMemObj( /// + `count == 0` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE /// + If ::UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS query is false +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + A pSpecConstant entry contains a size that does not match that of the specialization constant in the module. +/// + A pSpecConstant entry contains a nullptr pValue. +/// - ::UR_RESULT_ERROR_INVALID_SPEC_ID +/// + Any id specified in a pSpecConstant entry is not a valid specialization constant identifier. 
UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object @@ -5712,36 +5740,36 @@ urQueueFlush( /////////////////////////////////////////////////////////////////////////////// /// @brief Command type typedef enum ur_command_t { - UR_COMMAND_KERNEL_LAUNCH = 0, ///< Event created by ::urEnqueueKernelLaunch - UR_COMMAND_EVENTS_WAIT = 1, ///< Event created by ::urEnqueueEventsWait - UR_COMMAND_EVENTS_WAIT_WITH_BARRIER = 2, ///< Event created by ::urEnqueueEventsWaitWithBarrier - UR_COMMAND_MEM_BUFFER_READ = 3, ///< Event created by ::urEnqueueMemBufferRead - UR_COMMAND_MEM_BUFFER_WRITE = 4, ///< Event created by ::urEnqueueMemBufferWrite - UR_COMMAND_MEM_BUFFER_READ_RECT = 5, ///< Event created by ::urEnqueueMemBufferReadRect - UR_COMMAND_MEM_BUFFER_WRITE_RECT = 6, ///< Event created by ::urEnqueueMemBufferWriteRect - UR_COMMAND_MEM_BUFFER_COPY = 7, ///< Event created by ::urEnqueueMemBufferCopy - UR_COMMAND_MEM_BUFFER_COPY_RECT = 8, ///< Event created by ::urEnqueueMemBufferCopyRect - UR_COMMAND_MEM_BUFFER_FILL = 9, ///< Event created by ::urEnqueueMemBufferFill - UR_COMMAND_MEM_IMAGE_READ = 10, ///< Event created by ::urEnqueueMemImageRead - UR_COMMAND_MEM_IMAGE_WRITE = 11, ///< Event created by ::urEnqueueMemImageWrite - UR_COMMAND_MEM_IMAGE_COPY = 12, ///< Event created by ::urEnqueueMemImageCopy - UR_COMMAND_MEM_BUFFER_MAP = 14, ///< Event created by ::urEnqueueMemBufferMap - UR_COMMAND_MEM_UNMAP = 16, ///< Event created by ::urEnqueueMemUnmap - UR_COMMAND_USM_FILL = 17, ///< Event created by ::urEnqueueUSMFill - UR_COMMAND_USM_MEMCPY = 18, ///< Event created by ::urEnqueueUSMMemcpy - UR_COMMAND_USM_PREFETCH = 19, ///< Event created by ::urEnqueueUSMPrefetch - UR_COMMAND_USM_ADVISE = 20, ///< Event created by ::urEnqueueUSMAdvise - UR_COMMAND_USM_FILL_2D = 21, ///< Event created by ::urEnqueueUSMFill2D - UR_COMMAND_USM_MEMCPY_2D = 22, ///< Event created by ::urEnqueueUSMMemcpy2D - 
UR_COMMAND_DEVICE_GLOBAL_VARIABLE_WRITE = 23, ///< Event created by ::urEnqueueDeviceGlobalVariableWrite - UR_COMMAND_DEVICE_GLOBAL_VARIABLE_READ = 24, ///< Event created by ::urEnqueueDeviceGlobalVariableRead - UR_COMMAND_READ_HOST_PIPE = 25, ///< Event created by ::urEnqueueReadHostPipe - UR_COMMAND_WRITE_HOST_PIPE = 26, ///< Event created by ::urEnqueueWriteHostPipe - UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP = 0x1000, ///< Event created by ::urCommandBufferEnqueueExp - UR_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP = 0x2000, ///< Event created by ::urBindlessImagesWaitExternalSemaphoreExp - UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP = 0x2001, ///< Event created by ::urBindlessImagesSignalExternalSemaphoreExp - UR_COMMAND_TIMESTAMP_RECORDING_EXP = 0x2002, ///< Event created by ::urEnqueueTimestampRecordingExp - UR_COMMAND_ENQUEUE_NATIVE_EXP = 0x2004, ///< Event created by ::urEnqueueNativeCommandExp + UR_COMMAND_KERNEL_LAUNCH = 0, ///< Event created by ::urEnqueueKernelLaunch + UR_COMMAND_EVENTS_WAIT = 1, ///< Event created by ::urEnqueueEventsWait + UR_COMMAND_EVENTS_WAIT_WITH_BARRIER = 2, ///< Event created by ::urEnqueueEventsWaitWithBarrier + UR_COMMAND_MEM_BUFFER_READ = 3, ///< Event created by ::urEnqueueMemBufferRead + UR_COMMAND_MEM_BUFFER_WRITE = 4, ///< Event created by ::urEnqueueMemBufferWrite + UR_COMMAND_MEM_BUFFER_READ_RECT = 5, ///< Event created by ::urEnqueueMemBufferReadRect + UR_COMMAND_MEM_BUFFER_WRITE_RECT = 6, ///< Event created by ::urEnqueueMemBufferWriteRect + UR_COMMAND_MEM_BUFFER_COPY = 7, ///< Event created by ::urEnqueueMemBufferCopy + UR_COMMAND_MEM_BUFFER_COPY_RECT = 8, ///< Event created by ::urEnqueueMemBufferCopyRect + UR_COMMAND_MEM_BUFFER_FILL = 9, ///< Event created by ::urEnqueueMemBufferFill + UR_COMMAND_MEM_IMAGE_READ = 10, ///< Event created by ::urEnqueueMemImageRead + UR_COMMAND_MEM_IMAGE_WRITE = 11, ///< Event created by ::urEnqueueMemImageWrite + UR_COMMAND_MEM_IMAGE_COPY = 12, ///< Event created by ::urEnqueueMemImageCopy + 
UR_COMMAND_MEM_BUFFER_MAP = 14, ///< Event created by ::urEnqueueMemBufferMap + UR_COMMAND_MEM_UNMAP = 16, ///< Event created by ::urEnqueueMemUnmap + UR_COMMAND_USM_FILL = 17, ///< Event created by ::urEnqueueUSMFill + UR_COMMAND_USM_MEMCPY = 18, ///< Event created by ::urEnqueueUSMMemcpy + UR_COMMAND_USM_PREFETCH = 19, ///< Event created by ::urEnqueueUSMPrefetch + UR_COMMAND_USM_ADVISE = 20, ///< Event created by ::urEnqueueUSMAdvise + UR_COMMAND_USM_FILL_2D = 21, ///< Event created by ::urEnqueueUSMFill2D + UR_COMMAND_USM_MEMCPY_2D = 22, ///< Event created by ::urEnqueueUSMMemcpy2D + UR_COMMAND_DEVICE_GLOBAL_VARIABLE_WRITE = 23, ///< Event created by ::urEnqueueDeviceGlobalVariableWrite + UR_COMMAND_DEVICE_GLOBAL_VARIABLE_READ = 24, ///< Event created by ::urEnqueueDeviceGlobalVariableRead + UR_COMMAND_READ_HOST_PIPE = 25, ///< Event created by ::urEnqueueReadHostPipe + UR_COMMAND_WRITE_HOST_PIPE = 26, ///< Event created by ::urEnqueueWriteHostPipe + UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP = 0x1000, ///< Event created by ::urCommandBufferEnqueueExp + UR_COMMAND_EXTERNAL_SEMAPHORE_WAIT_EXP = 0x2000, ///< Event created by ::urBindlessImagesWaitExternalSemaphoreExp + UR_COMMAND_EXTERNAL_SEMAPHORE_SIGNAL_EXP = 0x2001, ///< Event created by ::urBindlessImagesSignalExternalSemaphoreExp + UR_COMMAND_TIMESTAMP_RECORDING_EXP = 0x2002, ///< Event created by ::urEnqueueTimestampRecordingExp + UR_COMMAND_ENQUEUE_NATIVE_EXP = 0x2004, ///< Event created by ::urEnqueueNativeCommandExp /// @cond UR_COMMAND_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -6126,15 +6154,14 @@ urEnqueueKernelLaunch( const size_t *pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that ///< specify the number of local work-items forming a work-group that will ///< execute the kernel function. - ///< If nullptr, the runtime implementation will choose the work-group - ///< size. + ///< If nullptr, the runtime implementation will choose the work-group size. 
uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -6178,7 +6205,8 @@ urEnqueueEventsWait( ///< previously enqueued commands ///< must be complete. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -6224,7 +6252,8 @@ urEnqueueEventsWaitWithBarrier( ///< previously enqueued commands ///< must be complete. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -6275,7 +6304,8 @@ urEnqueueMemBufferRead( ///< If nullptr, the numEventsInWaitList must be 0, indicating that this ///< command does not wait on any event to complete. 
ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -6326,7 +6356,8 @@ urEnqueueMemBufferWrite( ///< If nullptr, the numEventsInWaitList must be 0, indicating that this ///< command does not wait on any event to complete. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -6394,7 +6425,8 @@ urEnqueueMemBufferReadRect( ///< If nullptr, the numEventsInWaitList must be 0, indicating that this ///< command does not wait on any event to complete. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -6463,7 +6495,8 @@ urEnqueueMemBufferWriteRect( ///< If nullptr, the numEventsInWaitList must be 0, indicating that this ///< command does not wait on any event to complete. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
); /////////////////////////////////////////////////////////////////////////////// @@ -6510,7 +6543,8 @@ urEnqueueMemBufferCopy( ///< If nullptr, the numEventsInWaitList must be 0, indicating that this ///< command does not wait on any event to complete. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -6569,7 +6603,8 @@ urEnqueueMemBufferCopyRect( ///< If nullptr, the numEventsInWaitList must be 0, indicating that this ///< command does not wait on any event to complete. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -6622,7 +6657,8 @@ urEnqueueMemBufferFill( ///< If nullptr, the numEventsInWaitList must be 0, indicating that this ///< command does not wait on any event to complete. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -6677,7 +6713,8 @@ urEnqueueMemImageRead( ///< If nullptr, the numEventsInWaitList must be 0, indicating that this ///< command does not wait on any event to complete. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. 
If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -6732,7 +6769,8 @@ urEnqueueMemImageWrite( ///< If nullptr, the numEventsInWaitList must be 0, indicating that this ///< command does not wait on any event to complete. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -6781,7 +6819,8 @@ urEnqueueMemImageCopy( ///< If nullptr, the numEventsInWaitList must be 0, indicating that this ///< command does not wait on any event to complete. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -6866,7 +6905,8 @@ urEnqueueMemBufferMap( ///< If nullptr, the numEventsInWaitList must be 0, indicating that this ///< command does not wait on any event to complete. ur_event_handle_t *phEvent, ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. void **ppRetMap ///< [out] return mapped pointer. TODO: move it before ///< numEventsInWaitList? ); @@ -6911,7 +6951,8 @@ urEnqueueMemUnmap( ///< If nullptr, the numEventsInWaitList must be 0, indicating that this ///< command does not wait on any event to complete. 
ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -6957,7 +6998,8 @@ urEnqueueUSMFill( ///< If nullptr, the numEventsInWaitList must be 0, indicating that this ///< command does not wait on any event to complete. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -7000,7 +7042,8 @@ urEnqueueUSMMemcpy( ///< If nullptr, the numEventsInWaitList must be 0, indicating that this ///< command does not wait on any event to complete. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -7048,7 +7091,8 @@ urEnqueueUSMPrefetch( ///< If nullptr, the numEventsInWaitList must be 0, indicating that this ///< command does not wait on any event to complete. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
); /////////////////////////////////////////////////////////////////////////////// @@ -7135,10 +7179,10 @@ urEnqueueUSMFill2D( uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -7186,10 +7230,10 @@ urEnqueueUSMMemcpy2D( uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -7225,10 +7269,10 @@ urEnqueueDeviceGlobalVariableWrite( uint32_t numEventsInWaitList, ///< [in] size of the event wait list. 
const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -7264,10 +7308,10 @@ urEnqueueDeviceGlobalVariableRead( uint32_t numEventsInWaitList, ///< [in] size of the event wait list. const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -7305,9 +7349,10 @@ urEnqueueReadHostPipe( const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the host pipe read. ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. 
- ur_event_handle_t *phEvent ///< [out][optional] returns an event object that identifies this read - ///< command + ur_event_handle_t *phEvent ///< [out][optional] returns an event object that identifies this read command ///< and can be used to query or queue a wait for this command to complete. + ///< If phEventWaitList and phEvent are not NULL, phEvent must not refer to + ///< an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -7348,6 +7393,8 @@ urEnqueueWriteHostPipe( ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t *phEvent ///< [out][optional] returns an event object that identifies this write command ///< and can be used to query or queue a wait for this command to complete. + ///< If phEventWaitList and phEvent are not NULL, phEvent must not refer to + ///< an element of the phEventWaitList array. ); #if !defined(__GNUC__) @@ -7355,7 +7402,7 @@ urEnqueueWriteHostPipe( #endif // Bindless Images Extension APIs #if !defined(__GNUC__) -#pragma region bindless images(experimental) +#pragma region bindless_images_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Handle of bindless image @@ -7366,12 +7413,12 @@ typedef uintptr_t ur_exp_image_native_handle_t; typedef uintptr_t ur_exp_image_mem_native_handle_t; /////////////////////////////////////////////////////////////////////////////// -/// @brief Handle of interop memory -typedef struct ur_exp_interop_mem_handle_t_ *ur_exp_interop_mem_handle_t; +/// @brief Handle of external memory +typedef struct ur_exp_external_mem_handle_t_ *ur_exp_external_mem_handle_t; /////////////////////////////////////////////////////////////////////////////// -/// @brief Handle of interop semaphore -typedef struct ur_exp_interop_semaphore_handle_t_ *ur_exp_interop_semaphore_handle_t; +/// @brief Handle of external semaphore +typedef struct 
ur_exp_external_semaphore_handle_t_ *ur_exp_external_semaphore_handle_t; /////////////////////////////////////////////////////////////////////////////// /// @brief Dictates the type of memory copy. @@ -7493,22 +7540,22 @@ typedef struct ur_exp_sampler_cubemap_properties_t { } ur_exp_sampler_cubemap_properties_t; /////////////////////////////////////////////////////////////////////////////// -/// @brief Describes an interop memory resource descriptor -typedef struct ur_exp_interop_mem_desc_t { +/// @brief Describes an external memory resource descriptor +typedef struct ur_exp_external_mem_desc_t { ur_structure_type_t stype; ///< [in] type of this structure, must be - ///< ::UR_STRUCTURE_TYPE_EXP_INTEROP_MEM_DESC + ///< ::UR_STRUCTURE_TYPE_EXP_EXTERNAL_MEM_DESC const void *pNext; ///< [in][optional] pointer to extension-specific structure -} ur_exp_interop_mem_desc_t; +} ur_exp_external_mem_desc_t; /////////////////////////////////////////////////////////////////////////////// -/// @brief Describes an interop semaphore resource descriptor -typedef struct ur_exp_interop_semaphore_desc_t { +/// @brief Describes an external semaphore resource descriptor +typedef struct ur_exp_external_semaphore_desc_t { ur_structure_type_t stype; ///< [in] type of this structure, must be - ///< ::UR_STRUCTURE_TYPE_EXP_INTEROP_SEMAPHORE_DESC + ///< ::UR_STRUCTURE_TYPE_EXP_EXTERNAL_SEMAPHORE_DESC const void *pNext; ///< [in][optional] pointer to extension-specific structure -} ur_exp_interop_semaphore_desc_t; +} ur_exp_external_semaphore_desc_t; /////////////////////////////////////////////////////////////////////////////// /// @brief Describes the (sub-)regions and the extent to be copied @@ -7813,7 +7860,8 @@ urBindlessImagesImageCopyExp( ///< previously enqueued commands ///< must be complete. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. 
If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -7916,23 +7964,23 @@ urBindlessImagesMipmapFreeExp( /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// + `::UR_EXP_EXTERNAL_MEM_TYPE_WIN32_NT_DX12_RESOURCE < memHandleType` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pInteropMemDesc` -/// + `NULL == phInteropMem` +/// + `NULL == pExternalMemDesc` +/// + `NULL == phExternalMem` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( - ur_context_handle_t hContext, ///< [in] handle of the context object - ur_device_handle_t hDevice, ///< [in] handle of the device object - size_t size, ///< [in] size of the external memory - ur_exp_external_mem_type_t memHandleType, ///< [in] type of external memory handle - ur_exp_interop_mem_desc_t *pInteropMemDesc, ///< [in] the interop memory descriptor - ur_exp_interop_mem_handle_t *phInteropMem ///< [out] interop memory handle to the external memory + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + size_t size, ///< [in] size of the external memory + ur_exp_external_mem_type_t memHandleType, ///< [in] type of external memory handle + ur_exp_external_mem_desc_t *pExternalMemDesc, ///< [in] the external memory descriptor + ur_exp_external_mem_handle_t *phExternalMem ///< [out] external memory handle to the external memory ); /////////////////////////////////////////////////////////////////////////////// -/// @brief Map an interop memory handle to an image memory handle +/// @brief Map an external memory handle to an image memory handle /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -7942,7 +7990,7 @@ 
urBindlessImagesImportExternalMemoryExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hContext` /// + `NULL == hDevice` -/// + `NULL == hInteropMem` +/// + `NULL == hExternalMem` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pImageFormat` /// + `NULL == pImageDesc` @@ -7960,12 +8008,42 @@ urBindlessImagesMapExternalArrayExp( ur_device_handle_t hDevice, ///< [in] handle of the device object const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description - ur_exp_interop_mem_handle_t hInteropMem, ///< [in] interop memory handle to the external memory + ur_exp_external_mem_handle_t hExternalMem, ///< [in] external memory handle to the external memory ur_exp_image_mem_native_handle_t *phImageMem ///< [out] image memory handle to the externally allocated memory ); /////////////////////////////////////////////////////////////////////////////// -/// @brief Release interop memory +/// @brief Map an external memory handle to a device memory region described by +/// void* +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hContext` +/// + `NULL == hDevice` +/// + `NULL == hExternalMem` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == ppRetMem` +/// - ::UR_RESULT_ERROR_INVALID_CONTEXT +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +UR_APIEXPORT ur_result_t UR_APICALL +urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory 
region to map + ur_exp_external_mem_handle_t hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory +); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Release external memory /// /// @remarks /// _Analogues_ @@ -7979,14 +8057,14 @@ urBindlessImagesMapExternalArrayExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hContext` /// + `NULL == hDevice` -/// + `NULL == hInteropMem` +/// + `NULL == hExternalMem` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE UR_APIEXPORT ur_result_t UR_APICALL -urBindlessImagesReleaseInteropExp( - ur_context_handle_t hContext, ///< [in] handle of the context object - ur_device_handle_t hDevice, ///< [in] handle of the device object - ur_exp_interop_mem_handle_t hInteropMem ///< [in][release] handle of interop memory to be destroyed +urBindlessImagesReleaseExternalMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + ur_exp_external_mem_handle_t hExternalMem ///< [in][release] handle of external memory to be destroyed ); /////////////////////////////////////////////////////////////////////////////// @@ -8007,17 +8085,17 @@ urBindlessImagesReleaseInteropExp( /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// + `::UR_EXP_EXTERNAL_SEMAPHORE_TYPE_WIN32_NT_DX12_FENCE < semHandleType` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pInteropSemaphoreDesc` -/// + `NULL == phInteropSemaphore` +/// + `NULL == pExternalSemaphoreDesc` +/// + `NULL == phExternalSemaphore` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( - ur_context_handle_t hContext, ///< [in] handle of the context object - ur_device_handle_t hDevice, ///< [in] handle of the device object - 
ur_exp_external_semaphore_type_t semHandleType, ///< [in] type of external memory handle - ur_exp_interop_semaphore_desc_t *pInteropSemaphoreDesc, ///< [in] the interop semaphore descriptor - ur_exp_interop_semaphore_handle_t *phInteropSemaphore ///< [out] interop semaphore handle to the external semaphore + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + ur_exp_external_semaphore_type_t semHandleType, ///< [in] type of external semaphore handle + ur_exp_external_semaphore_desc_t *pExternalSemaphoreDesc, ///< [in] the external semaphore descriptor + ur_exp_external_semaphore_handle_t *phExternalSemaphore ///< [out] external semaphore handle to the external semaphore ); /////////////////////////////////////////////////////////////////////////////// @@ -8035,14 +8113,14 @@ urBindlessImagesImportExternalSemaphoreExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hContext` /// + `NULL == hDevice` -/// + `NULL == hInteropSemaphore` +/// + `NULL == hExternalSemaphore` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( - ur_context_handle_t hContext, ///< [in] handle of the context object - ur_device_handle_t hDevice, ///< [in] handle of the device object - ur_exp_interop_semaphore_handle_t hInteropSemaphore ///< [in][release] handle of interop semaphore to be destroyed + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + ur_exp_external_semaphore_handle_t hExternalSemaphore ///< [in][release] handle of external semaphore to be destroyed ); /////////////////////////////////////////////////////////////////////////////// @@ -8064,21 +8142,22 @@ urBindlessImagesReleaseExternalSemaphoreExp( /// - ::UR_RESULT_ERROR_INVALID_VALUE UR_APIEXPORT ur_result_t UR_APICALL 
urBindlessImagesWaitExternalSemaphoreExp( - ur_queue_handle_t hQueue, ///< [in] handle of the queue object - ur_exp_interop_semaphore_handle_t hSemaphore, ///< [in] interop semaphore handle - bool hasWaitValue, ///< [in] indicates whether the samephore is capable and should wait on a - ///< certain value. - ///< Otherwise the semaphore is treated like a binary state, and - ///< `waitValue` is ignored. - uint64_t waitValue, ///< [in] the value to be waited on - uint32_t numEventsInWaitList, ///< [in] size of the event wait list - const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of - ///< events that must be complete before this command can be executed. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that all - ///< previously enqueued commands - ///< must be complete. - ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_external_semaphore_handle_t hSemaphore, ///< [in] external semaphore handle + bool hasWaitValue, ///< [in] indicates whether the semaphore is capable and should wait on a + ///< certain value. + ///< Otherwise the semaphore is treated like a binary state, and + ///< `waitValue` is ignored. + uint64_t waitValue, ///< [in] the value to be waited on + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before this command can be executed. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that all + ///< previously enqueued commands + ///< must be complete. + ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular + ///< command instance. 
If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -8101,21 +8180,22 @@ urBindlessImagesWaitExternalSemaphoreExp( /// - ::UR_RESULT_ERROR_INVALID_VALUE UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( - ur_queue_handle_t hQueue, ///< [in] handle of the queue object - ur_exp_interop_semaphore_handle_t hSemaphore, ///< [in] interop semaphore handle - bool hasSignalValue, ///< [in] indicates whether the samephore is capable and should signal on a - ///< certain value. - ///< Otherwise the semaphore is treated like a binary state, and - ///< `signalValue` is ignored. - uint64_t signalValue, ///< [in] the value to be signalled - uint32_t numEventsInWaitList, ///< [in] size of the event wait list - const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of - ///< events that must be complete before this command can be executed. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that all - ///< previously enqueued commands - ///< must be complete. - ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_external_semaphore_handle_t hSemaphore, ///< [in] external semaphore handle + bool hasSignalValue, ///< [in] indicates whether the semaphore is capable and should signal on a + ///< certain value. + ///< Otherwise the semaphore is treated like a binary state, and + ///< `signalValue` is ignored. 
+ uint64_t signalValue, ///< [in] the value to be signalled + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before this command can be executed. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that all + ///< previously enqueued commands + ///< must be complete. + ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ); #if !defined(__GNUC__) @@ -8123,8 +8203,31 @@ urBindlessImagesSignalExternalSemaphoreExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for Command-Buffers #if !defined(__GNUC__) -#pragma region command buffer(experimental) +#pragma region command_buffer_(experimental) #endif +/////////////////////////////////////////////////////////////////////////////// +/// @brief Device kernel execution capability +typedef uint32_t ur_device_command_buffer_update_capability_flags_t; +typedef enum ur_device_command_buffer_update_capability_flag_t { + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS = UR_BIT(0), ///< Device supports updating the kernel arguments in command-buffer + ///< commands. + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE = UR_BIT(1), ///< Device supports updating the local work-group size in command-buffer + ///< commands. + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE = UR_BIT(2), ///< Device supports updating the global work-group size in command-buffer + ///< commands. + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET = UR_BIT(3), ///< Device supports updating the global work offset in command-buffer + ///< commands. 
+ UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE = UR_BIT(4), ///< Device supports updating the kernel handle in command-buffer commands. + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS = UR_BIT(5), ///< Device supports updating the event parameters in command-buffer + ///< commands. + /// @cond + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_device_command_buffer_update_capability_flag_t; +/// @brief Bit Mask for validating ur_device_command_buffer_update_capability_flags_t +#define UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAGS_MASK 0xffffffc0 + /////////////////////////////////////////////////////////////////////////////// /// @brief Command-buffer query information type typedef enum ur_exp_command_buffer_info_t { @@ -8132,6 +8235,10 @@ typedef enum ur_exp_command_buffer_info_t { ///< The reference count returned should be considered immediately stale. ///< It is unsuitable for general use in applications. This feature is ///< provided for identifying memory leaks. + UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR = 1, ///< [::ur_exp_command_buffer_desc_t] Returns a ::ur_exp_command_buffer_desc_t + ///< with the properties of the command-buffer. Returned values may differ + ///< from those passed on construction if the property was ignored by the + ///< adapter. /// @cond UR_EXP_COMMAND_BUFFER_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -8178,7 +8285,7 @@ typedef struct ur_exp_command_buffer_update_memobj_arg_desc_t { ///< ::UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_MEMOBJ_ARG_DESC const void *pNext; ///< [in][optional] pointer to extension-specific structure uint32_t argIndex; ///< [in] Argument index. - const ur_kernel_arg_mem_obj_properties_t *pProperties; ///< [in][optinal] Pointer to memory object properties. + const ur_kernel_arg_mem_obj_properties_t *pProperties; ///< [in][optional] Pointer to memory object properties. 
ur_mem_handle_t hNewMemObjArg; ///< [in][optional] Handle of memory object to set at argument index. } ur_exp_command_buffer_update_memobj_arg_desc_t; @@ -8190,7 +8297,7 @@ typedef struct ur_exp_command_buffer_update_pointer_arg_desc_t { ///< ::UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC const void *pNext; ///< [in][optional] pointer to extension-specific structure uint32_t argIndex; ///< [in] Argument index. - const ur_kernel_arg_pointer_properties_t *pProperties; ///< [in][optinal] Pointer to USM pointer properties. + const ur_kernel_arg_pointer_properties_t *pProperties; ///< [in][optional] Pointer to USM pointer properties. const void *pNewPointerArg; ///< [in][optional] USM pointer to memory location holding the argument ///< value to set at argument index. @@ -8204,7 +8311,7 @@ typedef struct ur_exp_command_buffer_update_value_arg_desc_t { const void *pNext; ///< [in][optional] pointer to extension-specific structure uint32_t argIndex; ///< [in] Argument index. uint32_t argSize; ///< [in] Argument size. - const ur_kernel_arg_value_properties_t *pProperties; ///< [in][optinal] Pointer to value properties. + const ur_kernel_arg_value_properties_t *pProperties; ///< [in][optional] Pointer to value properties. const void *pNewValueArg; ///< [in][optional] Argument value representing matching kernel arg type to ///< set at argument index. @@ -8216,6 +8323,11 @@ typedef struct ur_exp_command_buffer_update_kernel_launch_desc_t { ur_structure_type_t stype; ///< [in] type of this structure, must be ///< ::UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC const void *pNext; ///< [in][optional] pointer to extension-specific structure + ur_kernel_handle_t hNewKernel; ///< [in][optional] The new kernel handle. If this parameter is nullptr, + ///< the current kernel handle in `hCommand` + ///< will be used. If a kernel handle is passed, it must be a valid kernel + ///< alternative as defined in + ///< ::urCommandBufferAppendKernelLaunchExp. 
uint32_t numNewMemObjArgs; ///< [in] Length of pNewMemObjArgList. uint32_t numNewPointerArgs; ///< [in] Length of pNewPointerArgList. uint32_t numNewValueArgs; ///< [in] Length of pNewValueArgList. @@ -8227,15 +8339,25 @@ typedef struct ur_exp_command_buffer_update_kernel_launch_desc_t { const ur_exp_command_buffer_update_value_arg_desc_t *pNewValueArgList; ///< [in][optional][range(0, numNewValueArgs)] An array describing the new ///< kernel value arguments for the command. size_t *pNewGlobalWorkOffset; ///< [in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned - ///< values that describe the offset used to calculate the global ID. + ///< values that describe the offset used + ///< to calculate the global ID. If this parameter is nullptr, the current + ///< global work offset will be used. This parameter is required if + ///< `newWorkDim` is different from the current work dimensions + ///< in the command. size_t *pNewGlobalWorkSize; ///< [in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned - ///< values that describe the number of global work-items. + ///< values that describe the number of + ///< global work-items. If this parameter is nullptr, the current global + ///< work size in `hCommand` will be used. + ///< This parameter is required if `newWorkDim` is different from the + ///< current work dimensions in the command. size_t *pNewLocalWorkSize; ///< [in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned - ///< values that describe the number of work-items that make up a - ///< work-group. If newWorkDim is non-zero and pNewLocalWorkSize is - ///< nullptr, then runtime implementation will choose the work-group size. - ///< If newWorkDim is zero and pNewLocalWorkSize is nullptr, then the local - ///< work size is unchanged. + ///< values that describe the number of + ///< work-items that make up a work-group. 
If `pNewGlobalWorkSize` is set + ///< and `pNewLocalWorkSize` is nullptr, + ///< then the runtime implementation will choose the local work size. If + ///< `pNewGlobalWorkSize` is nullptr and + ///< `pNewLocalWorkSize` is nullptr, the current local work size in the + ///< command will be used. } ur_exp_command_buffer_update_kernel_launch_desc_t; @@ -8357,12 +8479,23 @@ urCommandBufferFinalizeExp( /// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION /// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE /// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + `phKernelAlternatives == NULL && numKernelAlternatives > 0` +/// + `phKernelAlternatives != NULL && numKernelAlternatives == 0` +/// + If `phKernelAlternatives` contains `hKernel` /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// - ::UR_RESULT_ERROR_INVALID_OPERATION - "phCommand is not NULL and hCommandBuffer is not updatable." UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t hCommandBuffer, ///< [in] Handle of the command-buffer object. @@ -8370,12 +8503,29 @@ urCommandBufferAppendKernelLaunchExp( uint32_t workDim, ///< [in] Dimension of the kernel execution. 
const size_t *pGlobalWorkOffset, ///< [in] Offset to use when executing kernel. const size_t *pGlobalWorkSize, ///< [in] Global work size to use when executing kernel. - const size_t *pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. + const size_t *pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. If this + ///< parameter is nullptr, then a local work size will be generated by the + ///< implementation. + uint32_t numKernelAlternatives, ///< [in] The number of kernel alternatives provided in + ///< phKernelAlternatives. + ur_kernel_handle_t *phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernel handles + ///< that might be used to update the kernel in this + ///< command after the command-buffer is finalized. The default kernel + ///< `hKernel` is implicitly marked as an alternative. It's + ///< invalid to specify it as part of this list. uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t *pSyncPoint, ///< [out][optional] Sync point associated with this command. - ur_exp_command_buffer_command_handle_t *phCommand ///< [out][optional] Handle to this command. + ur_event_handle_t *phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. 
+ ur_exp_command_buffer_command_handle_t *phCommand ///< [out][optional] Handle to this command. Only available if the + ///< command-buffer is updatable. ); /////////////////////////////////////////////////////////////////////////////// @@ -8400,6 +8550,13 @@ urCommandBufferAppendKernelLaunchExp( /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -8411,7 +8568,15 @@ urCommandBufferAppendUSMMemcpyExp( uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. - ur_exp_command_buffer_sync_point_t *pSyncPoint ///< [out][optional] Sync point associated with this command. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. 
+ ur_exp_command_buffer_sync_point_t *pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t *phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t *phCommand ///< [out][optional] Handle to this command. ); /////////////////////////////////////////////////////////////////////////////// @@ -8438,6 +8603,13 @@ urCommandBufferAppendUSMMemcpyExp( /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -8450,7 +8622,15 @@ urCommandBufferAppendUSMFillExp( uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. - ur_exp_command_buffer_sync_point_t *pSyncPoint ///< [out][optional] sync point associated with this command. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. 
+ const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. + ur_exp_command_buffer_sync_point_t *pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t *phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t *phCommand ///< [out][optional] Handle to this command. ); /////////////////////////////////////////////////////////////////////////////// @@ -8471,6 +8651,13 @@ urCommandBufferAppendUSMFillExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -8484,7 +8671,15 @@ urCommandBufferAppendMemBufferCopyExp( uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. 
- ur_exp_command_buffer_sync_point_t *pSyncPoint ///< [out][optional] Sync point associated with this command. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. + ur_exp_command_buffer_sync_point_t *pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t *phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t *phCommand ///< [out][optional] Handle to this command. ); /////////////////////////////////////////////////////////////////////////////// @@ -8506,6 +8701,13 @@ urCommandBufferAppendMemBufferCopyExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -8518,7 +8720,15 @@ urCommandBufferAppendMemBufferWriteExp( uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. 
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. - ur_exp_command_buffer_sync_point_t *pSyncPoint ///< [out][optional] Sync point associated with this command. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. + ur_exp_command_buffer_sync_point_t *pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t *phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t *phCommand ///< [out][optional] Handle to this command. ); /////////////////////////////////////////////////////////////////////////////// @@ -8540,6 +8750,13 @@ urCommandBufferAppendMemBufferWriteExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. 
/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -8552,7 +8769,15 @@ urCommandBufferAppendMemBufferReadExp( uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. - ur_exp_command_buffer_sync_point_t *pSyncPoint ///< [out][optional] Sync point associated with this command. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. + ur_exp_command_buffer_sync_point_t *pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t *phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t *phCommand ///< [out][optional] Handle to this command. ); /////////////////////////////////////////////////////////////////////////////// @@ -8573,6 +8798,13 @@ urCommandBufferAppendMemBufferReadExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. 
+/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -8590,7 +8822,15 @@ urCommandBufferAppendMemBufferCopyRectExp( uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. - ur_exp_command_buffer_sync_point_t *pSyncPoint ///< [out][optional] Sync point associated with this command. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. + ur_exp_command_buffer_sync_point_t *pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t *phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t *phCommand ///< [out][optional] Handle to this command. 
); /////////////////////////////////////////////////////////////////////////////// @@ -8612,6 +8852,13 @@ urCommandBufferAppendMemBufferCopyRectExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -8632,7 +8879,15 @@ urCommandBufferAppendMemBufferWriteRectExp( uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. - ur_exp_command_buffer_sync_point_t *pSyncPoint ///< [out][optional] Sync point associated with this command. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. + ur_exp_command_buffer_sync_point_t *pSyncPoint, ///< [out][optional] Sync point associated with this command. 
+ ur_event_handle_t *phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t *phCommand ///< [out][optional] Handle to this command. ); /////////////////////////////////////////////////////////////////////////////// @@ -8654,6 +8909,13 @@ urCommandBufferAppendMemBufferWriteRectExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -8673,7 +8935,15 @@ urCommandBufferAppendMemBufferReadRectExp( uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. - ur_exp_command_buffer_sync_point_t *pSyncPoint ///< [out][optional] Sync point associated with this command. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. 
If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. + ur_exp_command_buffer_sync_point_t *pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t *phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t *phCommand ///< [out][optional] Handle to this command. ); /////////////////////////////////////////////////////////////////////////////// @@ -8697,6 +8967,13 @@ urCommandBufferAppendMemBufferReadRectExp( /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT /// - ::UR_RESULT_ERROR_INVALID_SIZE /// + If `offset + size` results in an out-of-bounds access. +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -8710,7 +8987,15 @@ urCommandBufferAppendMemBufferFillExp( uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. - ur_exp_command_buffer_sync_point_t *pSyncPoint ///< [out][optional] sync point associated with this command. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. 
+ const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. + ur_exp_command_buffer_sync_point_t *pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t *phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t *phCommand ///< [out][optional] Handle to this command. ); /////////////////////////////////////////////////////////////////////////////// @@ -8741,6 +9026,13 @@ urCommandBufferAppendMemBufferFillExp( /// - ::UR_RESULT_ERROR_INVALID_SIZE /// + `size == 0` /// + If `size` is higher than the allocation size of `pMemory` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -8752,7 +9044,15 @@ urCommandBufferAppendUSMPrefetchExp( uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. - ur_exp_command_buffer_sync_point_t *pSyncPoint ///< [out][optional] sync point associated with this command. 
+ uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. + ur_exp_command_buffer_sync_point_t *pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t *phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t *phCommand ///< [out][optional] Handle to this command. ); /////////////////////////////////////////////////////////////////////////////// @@ -8783,6 +9083,13 @@ urCommandBufferAppendUSMPrefetchExp( /// - ::UR_RESULT_ERROR_INVALID_SIZE /// + `size == 0` /// + If `size` is higher than the allocation size of `pMemory` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -8794,7 +9101,15 @@ urCommandBufferAppendUSMAdviseExp( uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. 
- ur_exp_command_buffer_sync_point_t *pSyncPoint ///< [out][optional] sync point associated with this command. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. + ur_exp_command_buffer_sync_point_t *pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t *phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t *phCommand ///< [out][optional] Handle to this command. ); /////////////////////////////////////////////////////////////////////////////// @@ -8826,7 +9141,8 @@ urCommandBufferEnqueueExp( ///< events that must be complete before the command-buffer execution. ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< command-buffer execution instance. + ///< command-buffer execution instance. If phEventWaitList and phEvent are + ///< not NULL, phEvent must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -8867,9 +9183,10 @@ urCommandBufferReleaseCommandExp( ); /////////////////////////////////////////////////////////////////////////////// -/// @brief Update a kernel launch command in a finalized command-buffer. This -/// entry-point is synchronous and may block if the command-buffer is -/// executing when the entry-point is called. +/// @brief Update a kernel launch command in a finalized command-buffer. 
+/// +/// @details +/// This entry-point is synchronous and may block if the command-buffer is executing when the entry-point is called. /// /// @returns /// - ::UR_RESULT_SUCCESS /// @@ -8885,18 +9202,17 @@ urCommandBufferReleaseCommandExp( /// - ::UR_RESULT_ERROR_INVALID_OPERATION /// + If ::ur_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to. /// + If the command-buffer `hCommand` belongs to has not been finalized. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and different from the work-dim used on creation of `hCommand`. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value and `pUpdateKernelLaunch->pNewGlobalWorkSize` is NULL. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value when `hCommand` was created with a NULL local work size. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a NULL value when `hCommand` was created with a non-NULL local work size. -/// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP +/// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP - If `hCommand` is not a kernel execution command. /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT /// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX /// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// + `pUpdateKernelLaunch->newWorkDim < 1 || pUpdateKernelLaunch->newWorkDim > 3` /// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE /// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + If `pUpdateKernelLaunch->hNewKernel` was not passed to the `hKernel` or `phKernelAlternatives` parameters of ::urCommandBufferAppendKernelLaunchExp when this command was created.
+/// + If `pUpdateKernelLaunch->newWorkDim` is different from the current workDim in `hCommand`, and either pUpdateKernelLaunch->pNewGlobalWorkSize or pUpdateKernelLaunch->pNewGlobalWorkOffset is nullptr. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -8905,6 +9221,71 @@ urCommandBufferUpdateKernelLaunchExp( const ur_exp_command_buffer_update_kernel_launch_desc_t *pUpdateKernelLaunch ///< [in] Struct defining how the kernel command is to be updated. ); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Get a new event that will be signaled the next time the command in the +/// command-buffer executes. +/// +/// @details +/// It is the user's responsibility to release the returned `phSignalEvent`. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hCommand` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == phSignalEvent` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS is not supported by the device associated with `hCommand`. +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// + If ::ur_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to. +/// + If the command-buffer `hCommand` belongs to has not been finalized. +/// + If no `phEvent` parameter was set on creation of the command associated with `hCommand`.
+/// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferUpdateSignalEventExp( + ur_exp_command_buffer_command_handle_t hCommand, ///< [in] Handle of the command-buffer command to update. + ur_event_handle_t *phSignalEvent ///< [out] Event to be signaled. +); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Set the list of wait events for a command to depend on to a list of +/// new events. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hCommand` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS is not supported by the device associated with `hCommand`. +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// + If ::ur_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to. +/// + If the command-buffer `hCommand` belongs to has not been finalized. +/// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// + If `numEventsInWaitList` does not match the number of wait events set when the command associated with `hCommand` was created. +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferUpdateWaitEventsExp( + ur_exp_command_buffer_command_handle_t hCommand, ///< [in] Handle of the command-buffer command to update. 
+ uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t *phEventWaitList ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. +); + /////////////////////////////////////////////////////////////////////////////// /// @brief Get command-buffer object information. /// @@ -8916,7 +9297,7 @@ urCommandBufferUpdateKernelLaunchExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT < propName` +/// + `::UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -8976,7 +9357,7 @@ urCommandBufferCommandGetInfoExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for Cooperative Kernels #if !defined(__GNUC__) -#pragma region cooperative kernels(experimental) +#pragma region cooperative_kernels_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// #ifndef UR_COOPERATIVE_KERNELS_EXTENSION_STRING_EXP @@ -9025,15 +9406,14 @@ urEnqueueCooperativeKernelLaunchExp( const size_t *pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that ///< specify the number of local work-items forming a work-group that will ///< execute the kernel function. - ///< If nullptr, the runtime implementation will choose the work-group - ///< size. + ///< If nullptr, the runtime implementation will choose the work-group size. uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. 
- ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ); /////////////////////////////////////////////////////////////////////////////// @@ -9064,7 +9444,7 @@ urKernelSuggestMaxCooperativeGroupCountExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for enqueuing timestamp recordings #if !defined(__GNUC__) -#pragma region enqueue timestamp recording(experimental) +#pragma region enqueue_timestamp_recording_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a command for recording the device timestamp @@ -9090,15 +9470,16 @@ urEnqueueTimestampRecordingExp( uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait - ///< events. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t *phEvent ///< [in,out] return an event object that identifies this particular kernel ///< execution instance. Profiling information can be queried ///< from this event as if `hQueue` had profiling enabled. Querying ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` ///< reports the timestamp at the time of the call to this function.
///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` - ///< reports the timestamp recorded when the command is executed on the device. + ///< reports the timestamp recorded when the command is executed on the + ///< device. If phEventWaitList and phEvent are not NULL, phEvent must not + ///< refer to an element of the phEventWaitList array. ); #if !defined(__GNUC__) @@ -9106,7 +9487,7 @@ urEnqueueTimestampRecordingExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for (kernel) Launch Properties #if !defined(__GNUC__) -#pragma region launch properties(experimental) +#pragma region launch_properties_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// #ifndef UR_LAUNCH_PROPERTIES_EXTENSION_STRING_EXP @@ -9225,7 +9606,9 @@ urEnqueueKernelLaunchCustomExp( ///< events that must be complete before the kernel execution. If nullptr, ///< the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList + ///< array. 
); #if !defined(__GNUC__) @@ -9233,7 +9616,7 @@ urEnqueueKernelLaunchCustomExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for multi-device compile #if !defined(__GNUC__) -#pragma region multi device compile(experimental) +#pragma region multi_device_compile_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// #ifndef UR_MULTI_DEVICE_COMPILE_EXTENSION_STRING_EXP @@ -9363,7 +9746,7 @@ urProgramLinkExp( #endif // Intel 'oneAPI' USM Import/Release Extension APIs #if !defined(__GNUC__) -#pragma region usm import release(experimental) +#pragma region usm_import_release_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Import memory into USM @@ -9416,7 +9799,7 @@ urUSMReleaseExp( #endif // Intel 'oneAPI' Unified Runtime Experimental APIs for USM P2P #if !defined(__GNUC__) -#pragma region usm p2p(experimental) +#pragma region usm_p2p_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// #ifndef UR_USM_P2P_EXTENSION_STRING_EXP @@ -9572,7 +9955,7 @@ urUsmP2PPeerAccessGetInfoExp( #endif // Intel 'oneAPI' Unified Runtime Experimental API for enqueuing work through native APIs #if !defined(__GNUC__) -#pragma region native enqueue(experimental) +#pragma region native_enqueue_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Native enqueue properties @@ -9637,7 +10020,8 @@ urEnqueueNativeCommandExp( ///< events that must be complete before the kernel execution. ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies the work that has - ///< been enqueued in nativeEnqueueFunc. + ///< been enqueued in nativeEnqueueFunc. If phEventWaitList and phEvent are + ///< not NULL, phEvent must not refer to an element of the phEventWaitList array. 
); #if !defined(__GNUC__) @@ -9645,7 +10029,7 @@ urEnqueueNativeCommandExp( #endif // Intel 'oneAPI' Unified Runtime Experimental API for mapping tensor objects #if !defined(__GNUC__) -#pragma region tensor map(experimental) +#pragma region tensor_map_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// /// @brief Handle of tensor map object @@ -11417,8 +11801,8 @@ typedef struct ur_bindless_images_import_external_memory_exp_params_t { ur_device_handle_t *phDevice; size_t *psize; ur_exp_external_mem_type_t *pmemHandleType; - ur_exp_interop_mem_desc_t **ppInteropMemDesc; - ur_exp_interop_mem_handle_t **pphInteropMem; + ur_exp_external_mem_desc_t **ppExternalMemDesc; + ur_exp_external_mem_handle_t **pphExternalMem; } ur_bindless_images_import_external_memory_exp_params_t; /////////////////////////////////////////////////////////////////////////////// @@ -11430,19 +11814,32 @@ typedef struct ur_bindless_images_map_external_array_exp_params_t { ur_device_handle_t *phDevice; const ur_image_format_t **ppImageFormat; const ur_image_desc_t **ppImageDesc; - ur_exp_interop_mem_handle_t *phInteropMem; + ur_exp_external_mem_handle_t *phExternalMem; ur_exp_image_mem_native_handle_t **pphImageMem; } ur_bindless_images_map_external_array_exp_params_t; /////////////////////////////////////////////////////////////////////////////// -/// @brief Function parameters for urBindlessImagesReleaseInteropExp +/// @brief Function parameters for urBindlessImagesMapExternalLinearMemoryExp /// @details Each entry is a pointer to the parameter passed to the function; /// allowing the callback the ability to modify the parameter's value -typedef struct ur_bindless_images_release_interop_exp_params_t { +typedef struct ur_bindless_images_map_external_linear_memory_exp_params_t { ur_context_handle_t *phContext; ur_device_handle_t *phDevice; - ur_exp_interop_mem_handle_t *phInteropMem; -} ur_bindless_images_release_interop_exp_params_t; + uint64_t 
*poffset; + uint64_t *psize; + ur_exp_external_mem_handle_t *phExternalMem; + void ***pppRetMem; +} ur_bindless_images_map_external_linear_memory_exp_params_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urBindlessImagesReleaseExternalMemoryExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_bindless_images_release_external_memory_exp_params_t { + ur_context_handle_t *phContext; + ur_device_handle_t *phDevice; + ur_exp_external_mem_handle_t *phExternalMem; +} ur_bindless_images_release_external_memory_exp_params_t; /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urBindlessImagesImportExternalSemaphoreExp @@ -11452,8 +11849,8 @@ typedef struct ur_bindless_images_import_external_semaphore_exp_params_t { ur_context_handle_t *phContext; ur_device_handle_t *phDevice; ur_exp_external_semaphore_type_t *psemHandleType; - ur_exp_interop_semaphore_desc_t **ppInteropSemaphoreDesc; - ur_exp_interop_semaphore_handle_t **pphInteropSemaphore; + ur_exp_external_semaphore_desc_t **ppExternalSemaphoreDesc; + ur_exp_external_semaphore_handle_t **pphExternalSemaphore; } ur_bindless_images_import_external_semaphore_exp_params_t; /////////////////////////////////////////////////////////////////////////////// @@ -11463,7 +11860,7 @@ typedef struct ur_bindless_images_import_external_semaphore_exp_params_t { typedef struct ur_bindless_images_release_external_semaphore_exp_params_t { ur_context_handle_t *phContext; ur_device_handle_t *phDevice; - ur_exp_interop_semaphore_handle_t *phInteropSemaphore; + ur_exp_external_semaphore_handle_t *phExternalSemaphore; } ur_bindless_images_release_external_semaphore_exp_params_t; /////////////////////////////////////////////////////////////////////////////// @@ -11472,7 +11869,7 @@ typedef 
struct ur_bindless_images_release_external_semaphore_exp_params_t { /// allowing the callback the ability to modify the parameter's value typedef struct ur_bindless_images_wait_external_semaphore_exp_params_t { ur_queue_handle_t *phQueue; - ur_exp_interop_semaphore_handle_t *phSemaphore; + ur_exp_external_semaphore_handle_t *phSemaphore; bool *phasWaitValue; uint64_t *pwaitValue; uint32_t *pnumEventsInWaitList; @@ -11486,7 +11883,7 @@ typedef struct ur_bindless_images_wait_external_semaphore_exp_params_t { /// allowing the callback the ability to modify the parameter's value typedef struct ur_bindless_images_signal_external_semaphore_exp_params_t { ur_queue_handle_t *phQueue; - ur_exp_interop_semaphore_handle_t *phSemaphore; + ur_exp_external_semaphore_handle_t *phSemaphore; bool *phasSignalValue; uint64_t *psignalValue; uint32_t *pnumEventsInWaitList; @@ -11673,9 +12070,14 @@ typedef struct ur_command_buffer_append_kernel_launch_exp_params_t { const size_t **ppGlobalWorkOffset; const size_t **ppGlobalWorkSize; const size_t **ppLocalWorkSize; + uint32_t *pnumKernelAlternatives; + ur_kernel_handle_t **pphKernelAlternatives; uint32_t *pnumSyncPointsInWaitList; const ur_exp_command_buffer_sync_point_t **ppSyncPointWaitList; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; ur_exp_command_buffer_sync_point_t **ppSyncPoint; + ur_event_handle_t **pphEvent; ur_exp_command_buffer_command_handle_t **pphCommand; } ur_command_buffer_append_kernel_launch_exp_params_t; @@ -11690,7 +12092,11 @@ typedef struct ur_command_buffer_append_usm_memcpy_exp_params_t { size_t *psize; uint32_t *pnumSyncPointsInWaitList; const ur_exp_command_buffer_sync_point_t **ppSyncPointWaitList; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; ur_exp_command_buffer_sync_point_t **ppSyncPoint; + ur_event_handle_t **pphEvent; + ur_exp_command_buffer_command_handle_t **pphCommand; } ur_command_buffer_append_usm_memcpy_exp_params_t; 
/////////////////////////////////////////////////////////////////////////////// @@ -11705,7 +12111,11 @@ typedef struct ur_command_buffer_append_usm_fill_exp_params_t { size_t *psize; uint32_t *pnumSyncPointsInWaitList; const ur_exp_command_buffer_sync_point_t **ppSyncPointWaitList; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; ur_exp_command_buffer_sync_point_t **ppSyncPoint; + ur_event_handle_t **pphEvent; + ur_exp_command_buffer_command_handle_t **pphCommand; } ur_command_buffer_append_usm_fill_exp_params_t; /////////////////////////////////////////////////////////////////////////////// @@ -11721,7 +12131,11 @@ typedef struct ur_command_buffer_append_mem_buffer_copy_exp_params_t { size_t *psize; uint32_t *pnumSyncPointsInWaitList; const ur_exp_command_buffer_sync_point_t **ppSyncPointWaitList; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; ur_exp_command_buffer_sync_point_t **ppSyncPoint; + ur_event_handle_t **pphEvent; + ur_exp_command_buffer_command_handle_t **pphCommand; } ur_command_buffer_append_mem_buffer_copy_exp_params_t; /////////////////////////////////////////////////////////////////////////////// @@ -11736,7 +12150,11 @@ typedef struct ur_command_buffer_append_mem_buffer_write_exp_params_t { const void **ppSrc; uint32_t *pnumSyncPointsInWaitList; const ur_exp_command_buffer_sync_point_t **ppSyncPointWaitList; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; ur_exp_command_buffer_sync_point_t **ppSyncPoint; + ur_event_handle_t **pphEvent; + ur_exp_command_buffer_command_handle_t **pphCommand; } ur_command_buffer_append_mem_buffer_write_exp_params_t; /////////////////////////////////////////////////////////////////////////////// @@ -11751,7 +12169,11 @@ typedef struct ur_command_buffer_append_mem_buffer_read_exp_params_t { void **ppDst; uint32_t *pnumSyncPointsInWaitList; const ur_exp_command_buffer_sync_point_t **ppSyncPointWaitList; + uint32_t 
*pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; ur_exp_command_buffer_sync_point_t **ppSyncPoint; + ur_event_handle_t **pphEvent; + ur_exp_command_buffer_command_handle_t **pphCommand; } ur_command_buffer_append_mem_buffer_read_exp_params_t; /////////////////////////////////////////////////////////////////////////////// @@ -11771,7 +12193,11 @@ typedef struct ur_command_buffer_append_mem_buffer_copy_rect_exp_params_t { size_t *pdstSlicePitch; uint32_t *pnumSyncPointsInWaitList; const ur_exp_command_buffer_sync_point_t **ppSyncPointWaitList; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; ur_exp_command_buffer_sync_point_t **ppSyncPoint; + ur_event_handle_t **pphEvent; + ur_exp_command_buffer_command_handle_t **pphCommand; } ur_command_buffer_append_mem_buffer_copy_rect_exp_params_t; /////////////////////////////////////////////////////////////////////////////// @@ -11791,7 +12217,11 @@ typedef struct ur_command_buffer_append_mem_buffer_write_rect_exp_params_t { void **ppSrc; uint32_t *pnumSyncPointsInWaitList; const ur_exp_command_buffer_sync_point_t **ppSyncPointWaitList; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; ur_exp_command_buffer_sync_point_t **ppSyncPoint; + ur_event_handle_t **pphEvent; + ur_exp_command_buffer_command_handle_t **pphCommand; } ur_command_buffer_append_mem_buffer_write_rect_exp_params_t; /////////////////////////////////////////////////////////////////////////////// @@ -11811,7 +12241,11 @@ typedef struct ur_command_buffer_append_mem_buffer_read_rect_exp_params_t { void **ppDst; uint32_t *pnumSyncPointsInWaitList; const ur_exp_command_buffer_sync_point_t **ppSyncPointWaitList; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; ur_exp_command_buffer_sync_point_t **ppSyncPoint; + ur_event_handle_t **pphEvent; + ur_exp_command_buffer_command_handle_t **pphCommand; } ur_command_buffer_append_mem_buffer_read_rect_exp_params_t; 
/////////////////////////////////////////////////////////////////////////////// @@ -11827,7 +12261,11 @@ typedef struct ur_command_buffer_append_mem_buffer_fill_exp_params_t { size_t *psize; uint32_t *pnumSyncPointsInWaitList; const ur_exp_command_buffer_sync_point_t **ppSyncPointWaitList; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; ur_exp_command_buffer_sync_point_t **ppSyncPoint; + ur_event_handle_t **pphEvent; + ur_exp_command_buffer_command_handle_t **pphCommand; } ur_command_buffer_append_mem_buffer_fill_exp_params_t; /////////////////////////////////////////////////////////////////////////////// @@ -11841,7 +12279,11 @@ typedef struct ur_command_buffer_append_usm_prefetch_exp_params_t { ur_usm_migration_flags_t *pflags; uint32_t *pnumSyncPointsInWaitList; const ur_exp_command_buffer_sync_point_t **ppSyncPointWaitList; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; ur_exp_command_buffer_sync_point_t **ppSyncPoint; + ur_event_handle_t **pphEvent; + ur_exp_command_buffer_command_handle_t **pphCommand; } ur_command_buffer_append_usm_prefetch_exp_params_t; /////////////////////////////////////////////////////////////////////////////// @@ -11855,7 +12297,11 @@ typedef struct ur_command_buffer_append_usm_advise_exp_params_t { ur_usm_advice_flags_t *padvice; uint32_t *pnumSyncPointsInWaitList; const ur_exp_command_buffer_sync_point_t **ppSyncPointWaitList; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; ur_exp_command_buffer_sync_point_t **ppSyncPoint; + ur_event_handle_t **pphEvent; + ur_exp_command_buffer_command_handle_t **pphCommand; } ur_command_buffer_append_usm_advise_exp_params_t; /////////////////////////////////////////////////////////////////////////////// @@ -11895,6 +12341,25 @@ typedef struct ur_command_buffer_update_kernel_launch_exp_params_t { const ur_exp_command_buffer_update_kernel_launch_desc_t **ppUpdateKernelLaunch; } 
ur_command_buffer_update_kernel_launch_exp_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urCommandBufferUpdateSignalEventExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_command_buffer_update_signal_event_exp_params_t { + ur_exp_command_buffer_command_handle_t *phCommand; + ur_event_handle_t **pphSignalEvent; +} ur_command_buffer_update_signal_event_exp_params_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urCommandBufferUpdateWaitEventsExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_command_buffer_update_wait_events_exp_params_t { + ur_exp_command_buffer_command_handle_t *phCommand; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; +} ur_command_buffer_update_wait_events_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urCommandBufferGetInfoExp /// @details Each entry is a pointer to the parameter passed to the function; @@ -12174,7 +12639,7 @@ typedef struct ur_device_get_native_handle_params_t { /// allowing the callback the ability to modify the parameter's value typedef struct ur_device_create_with_native_handle_params_t { ur_native_handle_t *phNativeDevice; - ur_platform_handle_t *phPlatform; + ur_adapter_handle_t *phAdapter; const ur_device_native_properties_t **ppProperties; ur_device_handle_t **pphDevice; } ur_device_create_with_native_handle_params_t; diff --git a/include/ur_api_funcs.def b/include/ur_api_funcs.def new file mode 100644 index 0000000000..f88754ad60 --- /dev/null +++ b/include/ur_api_funcs.def @@ -0,0 +1,216 @@ + +/* + * + * 
Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file ur_api_funcs.def + * @version v0.11-r0 + * + */ + + // Auto-generated file, do not edit. + +_UR_API(urPlatformGet) +_UR_API(urPlatformGetInfo) +_UR_API(urPlatformGetNativeHandle) +_UR_API(urPlatformCreateWithNativeHandle) +_UR_API(urPlatformGetApiVersion) +_UR_API(urPlatformGetBackendOption) +_UR_API(urContextCreate) +_UR_API(urContextRetain) +_UR_API(urContextRelease) +_UR_API(urContextGetInfo) +_UR_API(urContextGetNativeHandle) +_UR_API(urContextCreateWithNativeHandle) +_UR_API(urContextSetExtendedDeleter) +_UR_API(urEventGetInfo) +_UR_API(urEventGetProfilingInfo) +_UR_API(urEventWait) +_UR_API(urEventRetain) +_UR_API(urEventRelease) +_UR_API(urEventGetNativeHandle) +_UR_API(urEventCreateWithNativeHandle) +_UR_API(urEventSetCallback) +_UR_API(urProgramCreateWithIL) +_UR_API(urProgramCreateWithBinary) +_UR_API(urProgramBuild) +_UR_API(urProgramCompile) +_UR_API(urProgramLink) +_UR_API(urProgramRetain) +_UR_API(urProgramRelease) +_UR_API(urProgramGetFunctionPointer) +_UR_API(urProgramGetGlobalVariablePointer) +_UR_API(urProgramGetInfo) +_UR_API(urProgramGetBuildInfo) +_UR_API(urProgramSetSpecializationConstants) +_UR_API(urProgramGetNativeHandle) +_UR_API(urProgramCreateWithNativeHandle) +_UR_API(urProgramBuildExp) +_UR_API(urProgramCompileExp) +_UR_API(urProgramLinkExp) +_UR_API(urKernelCreate) +_UR_API(urKernelGetInfo) +_UR_API(urKernelGetGroupInfo) +_UR_API(urKernelGetSubGroupInfo) +_UR_API(urKernelRetain) +_UR_API(urKernelRelease) +_UR_API(urKernelGetNativeHandle) +_UR_API(urKernelCreateWithNativeHandle) +_UR_API(urKernelGetSuggestedLocalWorkSize) +_UR_API(urKernelSetArgValue) +_UR_API(urKernelSetArgLocal) +_UR_API(urKernelSetArgPointer) +_UR_API(urKernelSetExecInfo) +_UR_API(urKernelSetArgSampler) +_UR_API(urKernelSetArgMemObj) 
+_UR_API(urKernelSetSpecializationConstants) +_UR_API(urKernelSuggestMaxCooperativeGroupCountExp) +_UR_API(urQueueGetInfo) +_UR_API(urQueueCreate) +_UR_API(urQueueRetain) +_UR_API(urQueueRelease) +_UR_API(urQueueGetNativeHandle) +_UR_API(urQueueCreateWithNativeHandle) +_UR_API(urQueueFinish) +_UR_API(urQueueFlush) +_UR_API(urSamplerCreate) +_UR_API(urSamplerRetain) +_UR_API(urSamplerRelease) +_UR_API(urSamplerGetInfo) +_UR_API(urSamplerGetNativeHandle) +_UR_API(urSamplerCreateWithNativeHandle) +_UR_API(urMemImageCreate) +_UR_API(urMemBufferCreate) +_UR_API(urMemRetain) +_UR_API(urMemRelease) +_UR_API(urMemBufferPartition) +_UR_API(urMemGetNativeHandle) +_UR_API(urMemBufferCreateWithNativeHandle) +_UR_API(urMemImageCreateWithNativeHandle) +_UR_API(urMemGetInfo) +_UR_API(urMemImageGetInfo) +_UR_API(urPhysicalMemCreate) +_UR_API(urPhysicalMemRetain) +_UR_API(urPhysicalMemRelease) +_UR_API(urAdapterGet) +_UR_API(urAdapterRelease) +_UR_API(urAdapterRetain) +_UR_API(urAdapterGetLastError) +_UR_API(urAdapterGetInfo) +_UR_API(urEnqueueKernelLaunch) +_UR_API(urEnqueueEventsWait) +_UR_API(urEnqueueEventsWaitWithBarrier) +_UR_API(urEnqueueMemBufferRead) +_UR_API(urEnqueueMemBufferWrite) +_UR_API(urEnqueueMemBufferReadRect) +_UR_API(urEnqueueMemBufferWriteRect) +_UR_API(urEnqueueMemBufferCopy) +_UR_API(urEnqueueMemBufferCopyRect) +_UR_API(urEnqueueMemBufferFill) +_UR_API(urEnqueueMemImageRead) +_UR_API(urEnqueueMemImageWrite) +_UR_API(urEnqueueMemImageCopy) +_UR_API(urEnqueueMemBufferMap) +_UR_API(urEnqueueMemUnmap) +_UR_API(urEnqueueUSMFill) +_UR_API(urEnqueueUSMMemcpy) +_UR_API(urEnqueueUSMPrefetch) +_UR_API(urEnqueueUSMAdvise) +_UR_API(urEnqueueUSMFill2D) +_UR_API(urEnqueueUSMMemcpy2D) +_UR_API(urEnqueueDeviceGlobalVariableWrite) +_UR_API(urEnqueueDeviceGlobalVariableRead) +_UR_API(urEnqueueReadHostPipe) +_UR_API(urEnqueueWriteHostPipe) +_UR_API(urEnqueueKernelLaunchCustomExp) +_UR_API(urEnqueueCooperativeKernelLaunchExp) +_UR_API(urEnqueueTimestampRecordingExp) 
+_UR_API(urEnqueueNativeCommandExp) +_UR_API(urBindlessImagesUnsampledImageHandleDestroyExp) +_UR_API(urBindlessImagesSampledImageHandleDestroyExp) +_UR_API(urBindlessImagesImageAllocateExp) +_UR_API(urBindlessImagesImageFreeExp) +_UR_API(urBindlessImagesUnsampledImageCreateExp) +_UR_API(urBindlessImagesSampledImageCreateExp) +_UR_API(urBindlessImagesImageCopyExp) +_UR_API(urBindlessImagesImageGetInfoExp) +_UR_API(urBindlessImagesMipmapGetLevelExp) +_UR_API(urBindlessImagesMipmapFreeExp) +_UR_API(urBindlessImagesImportExternalMemoryExp) +_UR_API(urBindlessImagesMapExternalArrayExp) +_UR_API(urBindlessImagesMapExternalLinearMemoryExp) +_UR_API(urBindlessImagesReleaseExternalMemoryExp) +_UR_API(urBindlessImagesImportExternalSemaphoreExp) +_UR_API(urBindlessImagesReleaseExternalSemaphoreExp) +_UR_API(urBindlessImagesWaitExternalSemaphoreExp) +_UR_API(urBindlessImagesSignalExternalSemaphoreExp) +_UR_API(urUSMHostAlloc) +_UR_API(urUSMDeviceAlloc) +_UR_API(urUSMSharedAlloc) +_UR_API(urUSMFree) +_UR_API(urUSMGetMemAllocInfo) +_UR_API(urUSMPoolCreate) +_UR_API(urUSMPoolRetain) +_UR_API(urUSMPoolRelease) +_UR_API(urUSMPoolGetInfo) +_UR_API(urUSMPitchedAllocExp) +_UR_API(urUSMImportExp) +_UR_API(urUSMReleaseExp) +_UR_API(urCommandBufferCreateExp) +_UR_API(urCommandBufferRetainExp) +_UR_API(urCommandBufferReleaseExp) +_UR_API(urCommandBufferFinalizeExp) +_UR_API(urCommandBufferAppendKernelLaunchExp) +_UR_API(urCommandBufferAppendUSMMemcpyExp) +_UR_API(urCommandBufferAppendUSMFillExp) +_UR_API(urCommandBufferAppendMemBufferCopyExp) +_UR_API(urCommandBufferAppendMemBufferWriteExp) +_UR_API(urCommandBufferAppendMemBufferReadExp) +_UR_API(urCommandBufferAppendMemBufferCopyRectExp) +_UR_API(urCommandBufferAppendMemBufferWriteRectExp) +_UR_API(urCommandBufferAppendMemBufferReadRectExp) +_UR_API(urCommandBufferAppendMemBufferFillExp) +_UR_API(urCommandBufferAppendUSMPrefetchExp) +_UR_API(urCommandBufferAppendUSMAdviseExp) +_UR_API(urCommandBufferEnqueueExp) 
+_UR_API(urCommandBufferRetainCommandExp) +_UR_API(urCommandBufferReleaseCommandExp) +_UR_API(urCommandBufferUpdateKernelLaunchExp) +_UR_API(urCommandBufferUpdateSignalEventExp) +_UR_API(urCommandBufferUpdateWaitEventsExp) +_UR_API(urCommandBufferGetInfoExp) +_UR_API(urCommandBufferCommandGetInfoExp) +_UR_API(urTensorMapEncodeIm2ColExp) +_UR_API(urTensorMapEncodeTiledExp) +_UR_API(urUsmP2PEnablePeerAccessExp) +_UR_API(urUsmP2PDisablePeerAccessExp) +_UR_API(urUsmP2PPeerAccessGetInfoExp) +_UR_API(urVirtualMemGranularityGetInfo) +_UR_API(urVirtualMemReserve) +_UR_API(urVirtualMemFree) +_UR_API(urVirtualMemMap) +_UR_API(urVirtualMemUnmap) +_UR_API(urVirtualMemSetAccess) +_UR_API(urVirtualMemGetInfo) +_UR_API(urDeviceGet) +_UR_API(urDeviceGetInfo) +_UR_API(urDeviceRetain) +_UR_API(urDeviceRelease) +_UR_API(urDevicePartition) +_UR_API(urDeviceSelectBinary) +_UR_API(urDeviceGetNativeHandle) +_UR_API(urDeviceCreateWithNativeHandle) +_UR_API(urDeviceGetGlobalTimestamps) +_UR_API(urLoaderConfigCreate) +_UR_API(urLoaderConfigEnableLayer) +_UR_API(urLoaderConfigGetInfo) +_UR_API(urLoaderConfigRelease) +_UR_API(urLoaderConfigRetain) +_UR_API(urLoaderConfigSetCodeLocationCallback) +_UR_API(urLoaderConfigSetMockingEnabled) +_UR_API(urLoaderInit) +_UR_API(urLoaderTearDown) diff --git a/include/ur_ddi.h b/include/ur_ddi.h index b7b79de03e..c1788cf582 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -7,7 +7,7 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * * @file ur_ddi.h - * @version v0.10-r0 + * @version v0.11-r0 * */ #ifndef UR_DDI_H_INCLUDED @@ -1626,8 +1626,8 @@ typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesImportExternalMemoryExp_t)( ur_device_handle_t, size_t, ur_exp_external_mem_type_t, - ur_exp_interop_mem_desc_t *, - ur_exp_interop_mem_handle_t *); + ur_exp_external_mem_desc_t *, + ur_exp_external_mem_handle_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for 
urBindlessImagesMapExternalArrayExp @@ -1636,15 +1636,25 @@ typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesMapExternalArrayExp_t)( ur_device_handle_t, const ur_image_format_t *, const ur_image_desc_t *, - ur_exp_interop_mem_handle_t, + ur_exp_external_mem_handle_t, ur_exp_image_mem_native_handle_t *); /////////////////////////////////////////////////////////////////////////////// -/// @brief Function-pointer for urBindlessImagesReleaseInteropExp -typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesReleaseInteropExp_t)( +/// @brief Function-pointer for urBindlessImagesMapExternalLinearMemoryExp +typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesMapExternalLinearMemoryExp_t)( ur_context_handle_t, ur_device_handle_t, - ur_exp_interop_mem_handle_t); + uint64_t, + uint64_t, + ur_exp_external_mem_handle_t, + void **); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urBindlessImagesReleaseExternalMemoryExp +typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesReleaseExternalMemoryExp_t)( + ur_context_handle_t, + ur_device_handle_t, + ur_exp_external_mem_handle_t); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urBindlessImagesImportExternalSemaphoreExp @@ -1652,21 +1662,21 @@ typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesImportExternalSemaphoreExp_t ur_context_handle_t, ur_device_handle_t, ur_exp_external_semaphore_type_t, - ur_exp_interop_semaphore_desc_t *, - ur_exp_interop_semaphore_handle_t *); + ur_exp_external_semaphore_desc_t *, + ur_exp_external_semaphore_handle_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urBindlessImagesReleaseExternalSemaphoreExp typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesReleaseExternalSemaphoreExp_t)( ur_context_handle_t, ur_device_handle_t, - ur_exp_interop_semaphore_handle_t); + ur_exp_external_semaphore_handle_t); 
/////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urBindlessImagesWaitExternalSemaphoreExp typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesWaitExternalSemaphoreExp_t)( ur_queue_handle_t, - ur_exp_interop_semaphore_handle_t, + ur_exp_external_semaphore_handle_t, bool, uint64_t, uint32_t, @@ -1677,7 +1687,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesWaitExternalSemaphoreExp_t)( /// @brief Function-pointer for urBindlessImagesSignalExternalSemaphoreExp typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesSignalExternalSemaphoreExp_t)( ur_queue_handle_t, - ur_exp_interop_semaphore_handle_t, + ur_exp_external_semaphore_handle_t, bool, uint64_t, uint32_t, @@ -1699,7 +1709,8 @@ typedef struct ur_bindless_images_exp_dditable_t { ur_pfnBindlessImagesMipmapFreeExp_t pfnMipmapFreeExp; ur_pfnBindlessImagesImportExternalMemoryExp_t pfnImportExternalMemoryExp; ur_pfnBindlessImagesMapExternalArrayExp_t pfnMapExternalArrayExp; - ur_pfnBindlessImagesReleaseInteropExp_t pfnReleaseInteropExp; + ur_pfnBindlessImagesMapExternalLinearMemoryExp_t pfnMapExternalLinearMemoryExp; + ur_pfnBindlessImagesReleaseExternalMemoryExp_t pfnReleaseExternalMemoryExp; ur_pfnBindlessImagesImportExternalSemaphoreExp_t pfnImportExternalSemaphoreExp; ur_pfnBindlessImagesReleaseExternalSemaphoreExp_t pfnReleaseExternalSemaphoreExp; ur_pfnBindlessImagesWaitExternalSemaphoreExp_t pfnWaitExternalSemaphoreExp; @@ -1921,8 +1932,13 @@ typedef ur_result_t(UR_APICALL *ur_pfnCommandBufferAppendKernelLaunchExp_t)( const size_t *, const size_t *, uint32_t, + ur_kernel_handle_t *, + uint32_t, const ur_exp_command_buffer_sync_point_t *, + uint32_t, + const ur_event_handle_t *, ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, ur_exp_command_buffer_command_handle_t *); /////////////////////////////////////////////////////////////////////////////// @@ -1934,7 +1950,11 @@ typedef ur_result_t(UR_APICALL 
*ur_pfnCommandBufferAppendUSMMemcpyExp_t)( size_t, uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *); + uint32_t, + const ur_event_handle_t *, + ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, + ur_exp_command_buffer_command_handle_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urCommandBufferAppendUSMFillExp @@ -1946,7 +1966,11 @@ typedef ur_result_t(UR_APICALL *ur_pfnCommandBufferAppendUSMFillExp_t)( size_t, uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *); + uint32_t, + const ur_event_handle_t *, + ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, + ur_exp_command_buffer_command_handle_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urCommandBufferAppendMemBufferCopyExp @@ -1959,7 +1983,11 @@ typedef ur_result_t(UR_APICALL *ur_pfnCommandBufferAppendMemBufferCopyExp_t)( size_t, uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *); + uint32_t, + const ur_event_handle_t *, + ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, + ur_exp_command_buffer_command_handle_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urCommandBufferAppendMemBufferWriteExp @@ -1971,7 +1999,11 @@ typedef ur_result_t(UR_APICALL *ur_pfnCommandBufferAppendMemBufferWriteExp_t)( const void *, uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *); + uint32_t, + const ur_event_handle_t *, + ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, + ur_exp_command_buffer_command_handle_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urCommandBufferAppendMemBufferReadExp @@ -1983,7 +2015,11 @@ typedef 
ur_result_t(UR_APICALL *ur_pfnCommandBufferAppendMemBufferReadExp_t)( void *, uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *); + uint32_t, + const ur_event_handle_t *, + ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, + ur_exp_command_buffer_command_handle_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urCommandBufferAppendMemBufferCopyRectExp @@ -2000,7 +2036,11 @@ typedef ur_result_t(UR_APICALL *ur_pfnCommandBufferAppendMemBufferCopyRectExp_t) size_t, uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *); + uint32_t, + const ur_event_handle_t *, + ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, + ur_exp_command_buffer_command_handle_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urCommandBufferAppendMemBufferWriteRectExp @@ -2017,7 +2057,11 @@ typedef ur_result_t(UR_APICALL *ur_pfnCommandBufferAppendMemBufferWriteRectExp_t void *, uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *); + uint32_t, + const ur_event_handle_t *, + ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, + ur_exp_command_buffer_command_handle_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urCommandBufferAppendMemBufferReadRectExp @@ -2034,7 +2078,11 @@ typedef ur_result_t(UR_APICALL *ur_pfnCommandBufferAppendMemBufferReadRectExp_t) void *, uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *); + uint32_t, + const ur_event_handle_t *, + ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, + ur_exp_command_buffer_command_handle_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for 
urCommandBufferAppendMemBufferFillExp @@ -2047,7 +2095,11 @@ typedef ur_result_t(UR_APICALL *ur_pfnCommandBufferAppendMemBufferFillExp_t)( size_t, uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *); + uint32_t, + const ur_event_handle_t *, + ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, + ur_exp_command_buffer_command_handle_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urCommandBufferAppendUSMPrefetchExp @@ -2058,7 +2110,11 @@ typedef ur_result_t(UR_APICALL *ur_pfnCommandBufferAppendUSMPrefetchExp_t)( ur_usm_migration_flags_t, uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *); + uint32_t, + const ur_event_handle_t *, + ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, + ur_exp_command_buffer_command_handle_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urCommandBufferAppendUSMAdviseExp @@ -2069,7 +2125,11 @@ typedef ur_result_t(UR_APICALL *ur_pfnCommandBufferAppendUSMAdviseExp_t)( ur_usm_advice_flags_t, uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *); + uint32_t, + const ur_event_handle_t *, + ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, + ur_exp_command_buffer_command_handle_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urCommandBufferEnqueueExp @@ -2096,6 +2156,19 @@ typedef ur_result_t(UR_APICALL *ur_pfnCommandBufferUpdateKernelLaunchExp_t)( ur_exp_command_buffer_command_handle_t, const ur_exp_command_buffer_update_kernel_launch_desc_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urCommandBufferUpdateSignalEventExp +typedef ur_result_t(UR_APICALL *ur_pfnCommandBufferUpdateSignalEventExp_t)( + 
ur_exp_command_buffer_command_handle_t, + ur_event_handle_t *); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urCommandBufferUpdateWaitEventsExp +typedef ur_result_t(UR_APICALL *ur_pfnCommandBufferUpdateWaitEventsExp_t)( + ur_exp_command_buffer_command_handle_t, + uint32_t, + const ur_event_handle_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urCommandBufferGetInfoExp typedef ur_result_t(UR_APICALL *ur_pfnCommandBufferGetInfoExp_t)( @@ -2137,6 +2210,8 @@ typedef struct ur_command_buffer_exp_dditable_t { ur_pfnCommandBufferRetainCommandExp_t pfnRetainCommandExp; ur_pfnCommandBufferReleaseCommandExp_t pfnReleaseCommandExp; ur_pfnCommandBufferUpdateKernelLaunchExp_t pfnUpdateKernelLaunchExp; + ur_pfnCommandBufferUpdateSignalEventExp_t pfnUpdateSignalEventExp; + ur_pfnCommandBufferUpdateWaitEventsExp_t pfnUpdateWaitEventsExp; ur_pfnCommandBufferGetInfoExp_t pfnGetInfoExp; ur_pfnCommandBufferCommandGetInfoExp_t pfnCommandGetInfoExp; } ur_command_buffer_exp_dditable_t; @@ -2427,7 +2502,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnDeviceGetNativeHandle_t)( /// @brief Function-pointer for urDeviceCreateWithNativeHandle typedef ur_result_t(UR_APICALL *ur_pfnDeviceCreateWithNativeHandle_t)( ur_native_handle_t, - ur_platform_handle_t, + ur_adapter_handle_t, const ur_device_native_properties_t *, ur_device_handle_t *); diff --git a/include/ur_print.h b/include/ur_print.h index d4c15084ed..5d7067bc26 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -947,20 +947,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintExpSamplerAddrModes(const struct ur_e UR_APIEXPORT ur_result_t UR_APICALL urPrintExpSamplerCubemapProperties(const struct ur_exp_sampler_cubemap_properties_t params, char *buffer, const size_t buff_size, size_t *out_size); /////////////////////////////////////////////////////////////////////////////// -/// @brief Print 
ur_exp_interop_mem_desc_t struct +/// @brief Print ur_exp_external_mem_desc_t struct /// @returns /// - ::UR_RESULT_SUCCESS /// - ::UR_RESULT_ERROR_INVALID_SIZE /// - `buff_size < out_size` -UR_APIEXPORT ur_result_t UR_APICALL urPrintExpInteropMemDesc(const struct ur_exp_interop_mem_desc_t params, char *buffer, const size_t buff_size, size_t *out_size); +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpExternalMemDesc(const struct ur_exp_external_mem_desc_t params, char *buffer, const size_t buff_size, size_t *out_size); /////////////////////////////////////////////////////////////////////////////// -/// @brief Print ur_exp_interop_semaphore_desc_t struct +/// @brief Print ur_exp_external_semaphore_desc_t struct /// @returns /// - ::UR_RESULT_SUCCESS /// - ::UR_RESULT_ERROR_INVALID_SIZE /// - `buff_size < out_size` -UR_APIEXPORT ur_result_t UR_APICALL urPrintExpInteropSemaphoreDesc(const struct ur_exp_interop_semaphore_desc_t params, char *buffer, const size_t buff_size, size_t *out_size); +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpExternalSemaphoreDesc(const struct ur_exp_external_semaphore_desc_t params, char *buffer, const size_t buff_size, size_t *out_size); /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_exp_image_copy_region_t struct @@ -970,6 +970,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintExpInteropSemaphoreDesc(const struct /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintExpImageCopyRegion(const struct ur_exp_image_copy_region_t params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_device_command_buffer_update_capability_flag_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintDeviceCommandBufferUpdateCapabilityFlags(enum 
ur_device_command_buffer_update_capability_flag_t value, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_exp_command_buffer_info_t enum /// @returns @@ -2187,12 +2195,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintBindlessImagesImportExternalMemoryExp UR_APIEXPORT ur_result_t UR_APICALL urPrintBindlessImagesMapExternalArrayExpParams(const struct ur_bindless_images_map_external_array_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); /////////////////////////////////////////////////////////////////////////////// -/// @brief Print ur_bindless_images_release_interop_exp_params_t struct +/// @brief Print ur_bindless_images_map_external_linear_memory_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintBindlessImagesMapExternalLinearMemoryExpParams(const struct ur_bindless_images_map_external_linear_memory_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_bindless_images_release_external_memory_exp_params_t struct /// @returns /// - ::UR_RESULT_SUCCESS /// - ::UR_RESULT_ERROR_INVALID_SIZE /// - `buff_size < out_size` -UR_APIEXPORT ur_result_t UR_APICALL urPrintBindlessImagesReleaseInteropExpParams(const struct ur_bindless_images_release_interop_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +UR_APIEXPORT ur_result_t UR_APICALL urPrintBindlessImagesReleaseExternalMemoryExpParams(const struct ur_bindless_images_release_external_memory_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); /////////////////////////////////////////////////////////////////////////////// /// @brief Print 
ur_bindless_images_import_external_semaphore_exp_params_t struct @@ -2482,6 +2498,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintCommandBufferReleaseCommandExpParams( /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintCommandBufferUpdateKernelLaunchExpParams(const struct ur_command_buffer_update_kernel_launch_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_command_buffer_update_signal_event_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintCommandBufferUpdateSignalEventExpParams(const struct ur_command_buffer_update_signal_event_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_command_buffer_update_wait_events_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintCommandBufferUpdateWaitEventsExpParams(const struct ur_command_buffer_update_wait_events_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_command_buffer_get_info_exp_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index e43e9f31dc..1b964b9226 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -7,7 +7,7 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * * @file ur_print.hpp - * @version v0.10-r0 + * @version v0.11-r0 * */ #ifndef UR_PRINT_HPP @@ -47,9 +47,9 @@ struct is_handle : std::true_type {}; template <> struct is_handle : std::true_type {}; template <> -struct is_handle : std::true_type 
{}; +struct is_handle : std::true_type {}; template <> -struct is_handle : std::true_type {}; +struct is_handle : std::true_type {}; template <> struct is_handle : std::true_type {}; template <> @@ -199,6 +199,8 @@ inline ur_result_t printFlag(std::ostream &os, uint32_t template <> inline ur_result_t printFlag(std::ostream &os, uint32_t flag); +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag); template <> inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_exp_command_buffer_info_t value, size_t size); @@ -349,9 +351,10 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_sampler_mip_properties_t params); inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_sampler_addr_modes_t params); inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_sampler_cubemap_properties_t params); -inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_interop_mem_desc_t params); -inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_interop_semaphore_desc_t params); +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_external_mem_desc_t params); +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_external_semaphore_desc_t params); inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_image_copy_region_t params); +inline std::ostream &operator<<(std::ostream &os, enum ur_device_command_buffer_update_capability_flag_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_command_buffer_info_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_command_buffer_command_info_t value); inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct 
ur_exp_command_buffer_desc_t params); @@ -733,9 +736,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_EXP: os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_EXP"; break; - case UR_FUNCTION_COMMAND_BUFFER_ENQUEUE_EXP: - os << "UR_FUNCTION_COMMAND_BUFFER_ENQUEUE_EXP"; - break; case UR_FUNCTION_USM_PITCHED_ALLOC_EXP: os << "UR_FUNCTION_USM_PITCHED_ALLOC_EXP"; break; @@ -772,9 +772,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_ARRAY_EXP: os << "UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_ARRAY_EXP"; break; - case UR_FUNCTION_BINDLESS_IMAGES_RELEASE_INTEROP_EXP: - os << "UR_FUNCTION_BINDLESS_IMAGES_RELEASE_INTEROP_EXP"; - break; case UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_SEMAPHORE_EXP: os << "UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_SEMAPHORE_EXP"; break; @@ -883,39 +880,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_LOADER_TEAR_DOWN: os << "UR_FUNCTION_LOADER_TEAR_DOWN"; break; - case UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP: - os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP"; - break; - case UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP: - os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP"; - break; - case UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP: - os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP"; - break; - case UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP: - os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP"; - break; - case UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_EXP: - os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_EXP"; - break; - case UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_RECT_EXP: - os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_RECT_EXP"; - break; - case UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_RECT_EXP: - os 
<< "UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_RECT_EXP"; - break; - case UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_RECT_EXP: - os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_RECT_EXP"; - break; - case UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_FILL_EXP: - os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_FILL_EXP"; - break; - case UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_PREFETCH_EXP: - os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_PREFETCH_EXP"; - break; - case UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_ADVISE_EXP: - os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_ADVISE_EXP"; - break; case UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP: os << "UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP"; break; @@ -964,6 +928,54 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_LOADER_CONFIG_SET_MOCKING_ENABLED: os << "UR_FUNCTION_LOADER_CONFIG_SET_MOCKING_ENABLED"; break; + case UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP: + os << "UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP"; + break; + case UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP: + os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP"; + break; + case UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP: + os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP"; + break; + case UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP: + os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP"; + break; + case UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP: + os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP"; + break; + case UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_EXP: + os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_EXP"; + break; + case UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_RECT_EXP: + os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_RECT_EXP"; + break; + case UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_RECT_EXP: + os << 
"UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_RECT_EXP"; + break; + case UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_RECT_EXP: + os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_RECT_EXP"; + break; + case UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_FILL_EXP: + os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_FILL_EXP"; + break; + case UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_PREFETCH_EXP: + os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_PREFETCH_EXP"; + break; + case UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_ADVISE_EXP: + os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_ADVISE_EXP"; + break; + case UR_FUNCTION_COMMAND_BUFFER_ENQUEUE_EXP: + os << "UR_FUNCTION_COMMAND_BUFFER_ENQUEUE_EXP"; + break; + case UR_FUNCTION_COMMAND_BUFFER_UPDATE_SIGNAL_EVENT_EXP: + os << "UR_FUNCTION_COMMAND_BUFFER_UPDATE_SIGNAL_EVENT_EXP"; + break; + case UR_FUNCTION_COMMAND_BUFFER_UPDATE_WAIT_EVENTS_EXP: + os << "UR_FUNCTION_COMMAND_BUFFER_UPDATE_WAIT_EVENTS_EXP"; + break; + case UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP: + os << "UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP"; + break; case UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP: os << "UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP"; break; @@ -1105,11 +1117,11 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_structure_type_t value case UR_STRUCTURE_TYPE_EXP_SAMPLER_MIP_PROPERTIES: os << "UR_STRUCTURE_TYPE_EXP_SAMPLER_MIP_PROPERTIES"; break; - case UR_STRUCTURE_TYPE_EXP_INTEROP_MEM_DESC: - os << "UR_STRUCTURE_TYPE_EXP_INTEROP_MEM_DESC"; + case UR_STRUCTURE_TYPE_EXP_EXTERNAL_MEM_DESC: + os << "UR_STRUCTURE_TYPE_EXP_EXTERNAL_MEM_DESC"; break; - case UR_STRUCTURE_TYPE_EXP_INTEROP_SEMAPHORE_DESC: - os << "UR_STRUCTURE_TYPE_EXP_INTEROP_SEMAPHORE_DESC"; + case UR_STRUCTURE_TYPE_EXP_EXTERNAL_SEMAPHORE_DESC: + os << "UR_STRUCTURE_TYPE_EXP_EXTERNAL_SEMAPHORE_DESC"; break; case UR_STRUCTURE_TYPE_EXP_FILE_DESCRIPTOR: os << "UR_STRUCTURE_TYPE_EXP_FILE_DESCRIPTOR"; @@ -1351,13 +1363,13 @@ inline 
ur_result_t printStruct(std::ostream &os, const void *ptr) { printPtr(os, pstruct); } break; - case UR_STRUCTURE_TYPE_EXP_INTEROP_MEM_DESC: { - const ur_exp_interop_mem_desc_t *pstruct = (const ur_exp_interop_mem_desc_t *)ptr; + case UR_STRUCTURE_TYPE_EXP_EXTERNAL_MEM_DESC: { + const ur_exp_external_mem_desc_t *pstruct = (const ur_exp_external_mem_desc_t *)ptr; printPtr(os, pstruct); } break; - case UR_STRUCTURE_TYPE_EXP_INTEROP_SEMAPHORE_DESC: { - const ur_exp_interop_semaphore_desc_t *pstruct = (const ur_exp_interop_semaphore_desc_t *)ptr; + case UR_STRUCTURE_TYPE_EXP_EXTERNAL_SEMAPHORE_DESC: { + const ur_exp_external_semaphore_desc_t *pstruct = (const ur_exp_external_semaphore_desc_t *)ptr; printPtr(os, pstruct); } break; @@ -1613,6 +1625,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_result_t value) { case UR_RESULT_ERROR_DEVICE_NOT_AVAILABLE: os << "UR_RESULT_ERROR_DEVICE_NOT_AVAILABLE"; break; + case UR_RESULT_ERROR_INVALID_SPEC_ID: + os << "UR_RESULT_ERROR_INVALID_SPEC_ID"; + break; case UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP: os << "UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP"; break; @@ -2566,8 +2581,11 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: os << "UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP"; break; - case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: - os << "UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP"; + case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP: + os << "UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP"; + break; + case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP: + os << "UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP"; break; case UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP: os << "UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP"; @@ -2608,11 +2626,11 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP: os << 
"UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP"; break; - case UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP: - os << "UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP"; + case UR_DEVICE_INFO_EXTERNAL_MEMORY_IMPORT_SUPPORT_EXP: + os << "UR_DEVICE_INFO_EXTERNAL_MEMORY_IMPORT_SUPPORT_EXP"; break; - case UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP: - os << "UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP"; + case UR_DEVICE_INFO_EXTERNAL_SEMAPHORE_IMPORT_SUPPORT_EXP: + os << "UR_DEVICE_INFO_EXTERNAL_SEMAPHORE_IMPORT_SUPPORT_EXP"; break; case UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP: os << "UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP"; @@ -4074,7 +4092,20 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info os << ")"; } break; - case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: { + case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP: { + const ur_device_command_buffer_update_capability_flags_t *tptr = (const ur_device_command_buffer_update_capability_flags_t *)ptr; + if (sizeof(ur_device_command_buffer_update_capability_flags_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_device_command_buffer_update_capability_flags_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + ur::details::printFlag(os, + *tptr); + + os << ")"; + } break; + case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP: { const ur_bool_t *tptr = (const ur_bool_t *)ptr; if (sizeof(ur_bool_t) > size) { os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; @@ -4242,7 +4273,7 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info os << ")"; } break; - case UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP: { + case UR_DEVICE_INFO_EXTERNAL_MEMORY_IMPORT_SUPPORT_EXP: { const ur_bool_t *tptr = (const ur_bool_t *)ptr; if (sizeof(ur_bool_t) > size) { os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; 
@@ -4254,7 +4285,7 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info os << ")"; } break; - case UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP: { + case UR_DEVICE_INFO_EXTERNAL_SEMAPHORE_IMPORT_SUPPORT_EXP: { const ur_bool_t *tptr = (const ur_bool_t *)ptr; if (sizeof(ur_bool_t) > size) { os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; @@ -7526,8 +7557,8 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_program_info_t value) case UR_PROGRAM_INFO_DEVICES: os << "UR_PROGRAM_INFO_DEVICES"; break; - case UR_PROGRAM_INFO_SOURCE: - os << "UR_PROGRAM_INFO_SOURCE"; + case UR_PROGRAM_INFO_IL: + os << "UR_PROGRAM_INFO_IL"; break; case UR_PROGRAM_INFO_BINARY_SIZES: os << "UR_PROGRAM_INFO_BINARY_SIZES"; @@ -7609,7 +7640,7 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_program_inf } os << "}"; } break; - case UR_PROGRAM_INFO_SOURCE: { + case UR_PROGRAM_INFO_IL: { const char *tptr = (const char *)ptr; printPtr(os, tptr); @@ -8019,6 +8050,12 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_kernel_group_info_t va case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: os << "UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE"; break; + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: + os << "UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE"; + break; + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: + os << "UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE"; + break; default: os << "unknown enumerator"; break; @@ -8111,6 +8148,32 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_kernel_grou os << ")"; } break; + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: { + + const size_t *tptr = (const size_t *)ptr; + os << "{"; + size_t nelems = size / sizeof(size_t); + for (size_t i = 0; i < nelems; ++i) { + if (i != 0) { + os << ", "; + } + + os << tptr[i]; + } + os << "}"; + } break; + case 
UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: { + const size_t *tptr = (const size_t *)ptr; + if (sizeof(size_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(size_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -8918,11 +8981,11 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_command_t value) { case UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP: os << "UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP"; break; - case UR_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP: - os << "UR_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP"; + case UR_COMMAND_EXTERNAL_SEMAPHORE_WAIT_EXP: + os << "UR_COMMAND_EXTERNAL_SEMAPHORE_WAIT_EXP"; break; - case UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP: - os << "UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP"; + case UR_COMMAND_EXTERNAL_SEMAPHORE_SIGNAL_EXP: + os << "UR_COMMAND_EXTERNAL_SEMAPHORE_SIGNAL_EXP"; break; case UR_COMMAND_TIMESTAMP_RECORDING_EXP: os << "UR_COMMAND_TIMESTAMP_RECORDING_EXP"; @@ -9619,11 +9682,11 @@ inline std::ostream &operator<<(std::ostream &os, const struct ur_exp_sampler_cu return os; } /////////////////////////////////////////////////////////////////////////////// -/// @brief Print operator for the ur_exp_interop_mem_desc_t type +/// @brief Print operator for the ur_exp_external_mem_desc_t type /// @returns /// std::ostream & -inline std::ostream &operator<<(std::ostream &os, const struct ur_exp_interop_mem_desc_t params) { - os << "(struct ur_exp_interop_mem_desc_t){"; +inline std::ostream &operator<<(std::ostream &os, const struct ur_exp_external_mem_desc_t params) { + os << "(struct ur_exp_external_mem_desc_t){"; os << ".stype = "; @@ -9639,11 +9702,11 @@ inline std::ostream &operator<<(std::ostream &os, const struct ur_exp_interop_me return os; } /////////////////////////////////////////////////////////////////////////////// -/// @brief 
Print operator for the ur_exp_interop_semaphore_desc_t type +/// @brief Print operator for the ur_exp_external_semaphore_desc_t type /// @returns /// std::ostream & -inline std::ostream &operator<<(std::ostream &os, const struct ur_exp_interop_semaphore_desc_t params) { - os << "(struct ur_exp_interop_semaphore_desc_t){"; +inline std::ostream &operator<<(std::ostream &os, const struct ur_exp_external_semaphore_desc_t params) { + os << "(struct ur_exp_external_semaphore_desc_t){"; os << ".stype = "; @@ -9694,6 +9757,116 @@ inline std::ostream &operator<<(std::ostream &os, const struct ur_exp_image_copy return os; } /////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_device_command_buffer_update_capability_flag_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, enum ur_device_command_buffer_update_capability_flag_t value) { + switch (value) { + case UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS: + os << "UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS"; + break; + case UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE: + os << "UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE"; + break; + case UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE: + os << "UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE"; + break; + case UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET: + os << "UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET"; + break; + case UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE: + os << "UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE"; + break; + case UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS: + os << "UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS"; + break; + default: + os << "unknown enumerator"; + break; + } + return os; +} + +namespace ur::details { 
+/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_device_command_buffer_update_capability_flag_t flag +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag) { + uint32_t val = flag; + bool first = true; + + if ((val & UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS) == (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS) { + val ^= (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS; + } + + if ((val & UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE) == (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE) { + val ^= (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE; + } + + if ((val & UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE) == (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE) { + val ^= (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE; + } + + if ((val & UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET) == (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET) { + val ^= (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET; + } + + if ((val & UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE) == (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE) { + val ^= 
(uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE; + } + + if ((val & UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS) == (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS) { + val ^= (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS; + } + if (val != 0) { + std::bitset<32> bits(val); + if (!first) { + os << " | "; + } + os << "unknown bit flags " << bits; + } else if (first) { + os << "0"; + } + return UR_RESULT_SUCCESS; +} +} // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_exp_command_buffer_info_t type /// @returns /// std::ostream & @@ -9702,6 +9875,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_command_buffer_inf case UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT: os << "UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT"; break; + case UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR: + os << "UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR"; + break; default: os << "unknown enumerator"; break; @@ -9730,6 +9906,18 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_exp_command os << ")"; } break; + case UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR: { + const ur_exp_command_buffer_desc_t *tptr = (const ur_exp_command_buffer_desc_t *)ptr; + if (sizeof(ur_exp_command_buffer_desc_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_exp_command_buffer_desc_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -9946,6 +10134,12 @@ inline std::ostream 
&operator<<(std::ostream &os, const struct ur_exp_command_bu ur::details::printStruct(os, (params.pNext)); + os << ", "; + os << ".hNewKernel = "; + + ur::details::printPtr(os, + (params.hNewKernel)); + os << ", "; os << ".numNewMemObjArgs = "; @@ -15658,16 +15852,16 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct os << *(params->pmemHandleType); os << ", "; - os << ".pInteropMemDesc = "; + os << ".pExternalMemDesc = "; ur::details::printPtr(os, - *(params->ppInteropMemDesc)); + *(params->ppExternalMemDesc)); os << ", "; - os << ".phInteropMem = "; + os << ".phExternalMem = "; ur::details::printPtr(os, - *(params->pphInteropMem)); + *(params->pphExternalMem)); return os; } @@ -15702,10 +15896,10 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct *(params->ppImageDesc)); os << ", "; - os << ".hInteropMem = "; + os << ".hExternalMem = "; ur::details::printPtr(os, - *(params->phInteropMem)); + *(params->phExternalMem)); os << ", "; os << ".phImageMem = "; @@ -15717,10 +15911,52 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct } /////////////////////////////////////////////////////////////////////////////// -/// @brief Print operator for the ur_bindless_images_release_interop_exp_params_t type +/// @brief Print operator for the ur_bindless_images_map_external_linear_memory_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_bindless_images_map_external_linear_memory_exp_params_t *params) { + + os << ".hContext = "; + + ur::details::printPtr(os, + *(params->phContext)); + + os << ", "; + os << ".hDevice = "; + + ur::details::printPtr(os, + *(params->phDevice)); + + os << ", "; + os << ".offset = "; + + os << *(params->poffset); + + os << ", "; + os << ".size = "; + + os << *(params->psize); + + os << ", "; + os << ".hExternalMem = "; + + ur::details::printPtr(os, + 
*(params->phExternalMem)); + + os << ", "; + os << ".ppRetMem = "; + + ur::details::printPtr(os, + *(params->pppRetMem)); + + return os; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_bindless_images_release_external_memory_exp_params_t type /// @returns /// std::ostream & -inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_bindless_images_release_interop_exp_params_t *params) { +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_bindless_images_release_external_memory_exp_params_t *params) { os << ".hContext = "; @@ -15734,10 +15970,10 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct *(params->phDevice)); os << ", "; - os << ".hInteropMem = "; + os << ".hExternalMem = "; ur::details::printPtr(os, - *(params->phInteropMem)); + *(params->phExternalMem)); return os; } @@ -15765,16 +16001,16 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct os << *(params->psemHandleType); os << ", "; - os << ".pInteropSemaphoreDesc = "; + os << ".pExternalSemaphoreDesc = "; ur::details::printPtr(os, - *(params->ppInteropSemaphoreDesc)); + *(params->ppExternalSemaphoreDesc)); os << ", "; - os << ".phInteropSemaphore = "; + os << ".phExternalSemaphore = "; ur::details::printPtr(os, - *(params->pphInteropSemaphore)); + *(params->pphExternalSemaphore)); return os; } @@ -15797,10 +16033,10 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct *(params->phDevice)); os << ", "; - os << ".hInteropSemaphore = "; + os << ".hExternalSemaphore = "; ur::details::printPtr(os, - *(params->phInteropSemaphore)); + *(params->phExternalSemaphore)); return os; } @@ -16400,6 +16636,23 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->ppLocalWorkSize)); + os << ", "; + os << ".numKernelAlternatives = "; + 
+ os << *(params->pnumKernelAlternatives); + + os << ", "; + os << ".phKernelAlternatives = {"; + for (size_t i = 0; *(params->pphKernelAlternatives) != NULL && i < *params->pnumKernelAlternatives; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphKernelAlternatives))[i]); + } + os << "}"; + os << ", "; os << ".numSyncPointsInWaitList = "; @@ -16412,13 +16665,36 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct *(params->ppSyncPointWaitList)); os << ", "; - os << ".pSyncPoint = "; + os << ".numEventsInWaitList = "; - ur::details::printPtr(os, - *(params->ppSyncPoint)); + os << *(params->pnumEventsInWaitList); os << ", "; - os << ".phCommand = "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + + os << ", "; + os << ".pSyncPoint = "; + + ur::details::printPtr(os, + *(params->ppSyncPoint)); + + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + os << ", "; + os << ".phCommand = "; ur::details::printPtr(os, *(params->pphCommand)); @@ -16465,12 +16741,41 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->ppSyncPointWaitList)); + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + os << ", "; os << ".pSyncPoint = "; ur::details::printPtr(os, *(params->ppSyncPoint)); + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + os << ", "; + os << ".phCommand 
= "; + + ur::details::printPtr(os, + *(params->pphCommand)); + return os; } @@ -16518,12 +16823,41 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->ppSyncPointWaitList)); + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + os << ", "; os << ".pSyncPoint = "; ur::details::printPtr(os, *(params->ppSyncPoint)); + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + os << ", "; + os << ".phCommand = "; + + ur::details::printPtr(os, + *(params->pphCommand)); + return os; } @@ -16576,12 +16910,41 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->ppSyncPointWaitList)); + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + os << ", "; os << ".pSyncPoint = "; ur::details::printPtr(os, *(params->ppSyncPoint)); + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + os << ", "; + os << ".phCommand = "; + + ur::details::printPtr(os, + *(params->pphCommand)); + return os; } @@ -16629,12 +16992,41 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->ppSyncPointWaitList)); + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << 
".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + os << ", "; os << ".pSyncPoint = "; ur::details::printPtr(os, *(params->ppSyncPoint)); + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + os << ", "; + os << ".phCommand = "; + + ur::details::printPtr(os, + *(params->pphCommand)); + return os; } @@ -16682,12 +17074,41 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->ppSyncPointWaitList)); + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + os << ", "; os << ".pSyncPoint = "; ur::details::printPtr(os, *(params->ppSyncPoint)); + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + os << ", "; + os << ".phCommand = "; + + ur::details::printPtr(os, + *(params->pphCommand)); + return os; } @@ -16760,12 +17181,41 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->ppSyncPointWaitList)); + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + os << ", "; os << ".pSyncPoint = "; ur::details::printPtr(os, *(params->ppSyncPoint)); + os << ", "; + os << ".phEvent 
= "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + os << ", "; + os << ".phCommand = "; + + ur::details::printPtr(os, + *(params->pphCommand)); + return os; } @@ -16838,12 +17288,41 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->ppSyncPointWaitList)); + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + os << ", "; os << ".pSyncPoint = "; ur::details::printPtr(os, *(params->ppSyncPoint)); + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + os << ", "; + os << ".phCommand = "; + + ur::details::printPtr(os, + *(params->pphCommand)); + return os; } @@ -16916,12 +17395,41 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->ppSyncPointWaitList)); + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + os << ", "; os << ".pSyncPoint = "; ur::details::printPtr(os, *(params->ppSyncPoint)); + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + os << ", "; + os << ".phCommand = "; + + ur::details::printPtr(os, + *(params->pphCommand)); + return os; } @@ -16974,12 +17482,41 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->ppSyncPointWaitList)); + os << ", "; + os << 
".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + os << ", "; os << ".pSyncPoint = "; ur::details::printPtr(os, *(params->ppSyncPoint)); + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + os << ", "; + os << ".phCommand = "; + + ur::details::printPtr(os, + *(params->pphCommand)); + return os; } @@ -17022,12 +17559,41 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->ppSyncPointWaitList)); + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + os << ", "; os << ".pSyncPoint = "; ur::details::printPtr(os, *(params->ppSyncPoint)); + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + os << ", "; + os << ".phCommand = "; + + ur::details::printPtr(os, + *(params->pphCommand)); + return os; } @@ -17070,12 +17636,41 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->ppSyncPointWaitList)); + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + os << ", "; os << 
".pSyncPoint = "; ur::details::printPtr(os, *(params->ppSyncPoint)); + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + os << ", "; + os << ".phCommand = "; + + ur::details::printPtr(os, + *(params->pphCommand)); + return os; } @@ -17170,6 +17765,57 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_command_buffer_update_signal_event_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_command_buffer_update_signal_event_exp_params_t *params) { + + os << ".hCommand = "; + + ur::details::printPtr(os, + *(params->phCommand)); + + os << ", "; + os << ".phSignalEvent = "; + + ur::details::printPtr(os, + *(params->pphSignalEvent)); + + return os; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_command_buffer_update_wait_events_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_command_buffer_update_wait_events_exp_params_t *params) { + + os << ".hCommand = "; + + ur::details::printPtr(os, + *(params->phCommand)); + + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_command_buffer_get_info_exp_params_t type /// @returns @@ -18024,10 +18670,10 @@ inline std::ostream &operator<<(std::ostream 
&os, [[maybe_unused]] const struct *(params->phNativeDevice))); os << ", "; - os << ".hPlatform = "; + os << ".hAdapter = "; ur::details::printPtr(os, - *(params->phPlatform)); + *(params->phAdapter)); os << ", "; os << ".pProperties = "; @@ -18070,6 +18716,11 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct return os; } +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const ur_bool_t value) { + os << (value ? "true" : "false"); + return os; +} + namespace ur::details { /////////////////////////////////////////////////////////////////////////////// // @brief Print pointer value @@ -18516,8 +19167,11 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_ case UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_ARRAY_EXP: { os << (const struct ur_bindless_images_map_external_array_exp_params_t *)params; } break; - case UR_FUNCTION_BINDLESS_IMAGES_RELEASE_INTEROP_EXP: { - os << (const struct ur_bindless_images_release_interop_exp_params_t *)params; + case UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP: { + os << (const struct ur_bindless_images_map_external_linear_memory_exp_params_t *)params; + } break; + case UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP: { + os << (const struct ur_bindless_images_release_external_memory_exp_params_t *)params; } break; case UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_EXP: { os << (const struct ur_bindless_images_import_external_semaphore_exp_params_t *)params; @@ -18627,6 +19281,12 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_ case UR_FUNCTION_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_EXP: { os << (const struct ur_command_buffer_update_kernel_launch_exp_params_t *)params; } break; + case UR_FUNCTION_COMMAND_BUFFER_UPDATE_SIGNAL_EVENT_EXP: { + os << (const struct ur_command_buffer_update_signal_event_exp_params_t *)params; + } break; + case UR_FUNCTION_COMMAND_BUFFER_UPDATE_WAIT_EVENTS_EXP: { + os << 
(const struct ur_command_buffer_update_wait_events_exp_params_t *)params; + } break; case UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP: { os << (const struct ur_command_buffer_get_info_exp_params_t *)params; } break; diff --git a/scripts/Doxyfile b/scripts/Doxyfile index 64f0ac58a6..0da9dfb918 100644 --- a/scripts/Doxyfile +++ b/scripts/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "Intel One API Unified Runtime API" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = v0.10 +PROJECT_NUMBER = v0.11 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/scripts/benchmarks/README.md b/scripts/benchmarks/README.md index 170b647da4..64a7a3eeb9 100644 --- a/scripts/benchmarks/README.md +++ b/scripts/benchmarks/README.md @@ -9,12 +9,38 @@ Scripts for running performance tests on SYCL and Unified Runtime. ## Running -`$ ./main.py ~/benchmarks_workdir/ ~/llvm/build/` +`$ ./main.py ~/benchmarks_workdir/ ~/llvm/build/ ~/ur adapter_name` -This will download and build everything in `~/benchmarks_workdir/` using the compiler in `~/llvm/build/`, and then run the benchmarks. The results will be stored in `benchmark_results.md`. +This will download and build everything in `~/benchmarks_workdir/` using the compiler in `~/llvm/build/`, UR source from `~/ur` and then run the benchmarks for `adapter_name` adapter. The results will be stored in `benchmark_results.md`. The scripts will try to reuse the files stored in `~/benchmarks_workdir/`, but the benchmarks will be rebuilt every time. To avoid that, use `-no-rebuild` option. +## Running in CI + +The benchmarks scripts are used in a GitHub Actions worflow, and can be automatically executed on a preconfigured system against any Pull Request. 
+ +![compute benchmarks](workflow.png "Compute Benchmarks CI job") + +To execute the benchmarks in CI, navigate to the `Actions` tab and then go to the `Compute Benchmarks` action. Here, you will find a list of previous runs and a "Run workflow" button. Upon clicking the button, you will be prompted to fill in a form to customize your benchmark run. The only mandatory field is the `PR number`, which is the identifier for the Pull Request against which you want the benchmarks to run. + +You can also include additional benchmark parameters, such as environment variables or filters. For a complete list of options, refer to `$ ./main.py --help`. + +Once all the required information is entered, click the "Run workflow" button to initiate a new workflow run. This will execute the benchmarks and then post the results as a comment on the specified Pull Request. + +By default, all benchmark runs are compared against `baseline`, which is a well-established set of the latest data. + +You must be a member of the `oneapi-src` organization to access these features. + +## Comparing results + +By default, the benchmark results are not stored. To store them, use the option `--save `. This will make the results available for comparison during the next benchmark runs. + +To compare a benchmark run with a previously stored result, use the option `--compare `. You can compare with more than one result. + +If no `--compare` option is specified, the benchmark run is compared against a previously stored `baseline`. This baseline is **not** automatically updated. To update it, use the `--save baseline` option. +The recommended way of updating the baseline is running the benchmarking +job on main after a merge of relevant changes. 
+ ## Requirements ### Python diff --git a/scripts/benchmarks/benches/SobelFilter.py b/scripts/benchmarks/benches/SobelFilter.py index e976bfaee8..b9e7619e47 100644 --- a/scripts/benchmarks/benches/SobelFilter.py +++ b/scripts/benchmarks/benches/SobelFilter.py @@ -12,7 +12,10 @@ class SobelFilter(VelocityBase): def __init__(self, vb: VelocityBench): super().__init__("sobel_filter", "sobel_filter", vb) + + def download_deps(self): self.download_untar("sobel_filter", "https://github.com/oneapi-src/Velocity-Bench/raw/main/sobel_filter/res/sobel_filter_data.tgz?download=", "sobel_filter_data.tgz") + return def name(self): return "Velocity-Bench Sobel Filter" @@ -32,5 +35,5 @@ def parse_output(self, stdout: str) -> float: if match: return round(float(match.group(1)) * 1000, 3) else: - raise ValueError("Failed to parse benchmark output.") + raise ValueError("{self.__class__.__name__}: Failed to parse benchmark output.") diff --git a/scripts/benchmarks/benches/api_overhead.py b/scripts/benchmarks/benches/api_overhead.py deleted file mode 100644 index d34f4c4ee8..0000000000 --- a/scripts/benchmarks/benches/api_overhead.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
-# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import os -import csv -import io -from utils.utils import run, git_clone -from .base import Benchmark -from .result import Result -from .options import options - -class APIOverheadSYCL(Benchmark): - def __init__(self, directory): - super().__init__(directory) - - def name(self): - return "api_overhead_benchmark_sycl, mean execution time per 10 kernels" - - def unit(self): - return "μs" - - def setup(self): - repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "0f758021dce9ba32341a503739b69db057433c59") - build_path = self.create_build_path('compute-benchmarks-build') - - configure_command = [ - "cmake", - f"-B {build_path}", - f"-S {repo_path}", - f"-DCMAKE_BUILD_TYPE=Release", - f"-DBUILD_SYCL=ON", - f"-DSYCL_COMPILER_ROOT={options.sycl}", - f"-DALLOW_WARNINGS=ON" - ] - run(configure_command, add_sycl=True) - - run(f"cmake --build {build_path} -j", add_sycl=True) - self.benchmark_bin = f"{build_path}/bin/api_overhead_benchmark_sycl" - - def run_internal(self, ioq, env_vars): - command = [ - f"{self.benchmark_bin}", - "--test=SubmitKernel", - f"--Ioq={ioq}", - "--DiscardEvents=0", - "--MeasureCompletion=0", - "--iterations=100000", - "--Profiling=0", - "--NumKernels=10", - "--KernelExecTime=1", - "--csv", - "--noHeaders" - ] - result = self.run_bench(command, env_vars) - (label, mean) = self.parse_output(result) - return Result(label=label, value=mean, command=command, env=env_vars, stdout=result) - - def run(self, env_vars) -> list[Result]: - results = [] - for ioq in [0, 1]: - results.append(self.run_internal(ioq, env_vars)) - - return results - - def parse_output(self, output): - csv_file = io.StringIO(output) - reader = csv.reader(csv_file) - next(reader, None) - data_row = next(reader, None) - if data_row is None: - raise ValueError("Benchmark output does not contain data.") - try: - label = data_row[0] - mean = 
float(data_row[1]) - return (label, mean) - except (ValueError, IndexError) as e: - raise ValueError(f"Error parsing output: {e}") - - def teardown(self): - return diff --git a/scripts/benchmarks/benches/base.py b/scripts/benchmarks/benches/base.py index 25b5d2619f..36f252cb42 100644 --- a/scripts/benchmarks/benches/base.py +++ b/scripts/benchmarks/benches/base.py @@ -6,7 +6,6 @@ import os import shutil from pathlib import Path -import subprocess # nosec B404 from .result import Result from .options import options from utils.utils import run @@ -17,18 +16,26 @@ class Benchmark: def __init__(self, directory): self.directory = directory - def run_bench(self, command, env_vars): - return run(command=command, env_vars=env_vars, add_sycl=True, cwd=options.benchmark_cwd).stdout.decode() - - def create_build_path(self, name): - build_path = os.path.join(self.directory, name) - - if options.rebuild and Path(build_path).exists(): - shutil.rmtree(build_path) + @staticmethod + def get_adapter_full_path(): + for libs_dir_name in ['lib', 'lib64']: + adapter_path = os.path.join( + options.ur_dir, libs_dir_name, f"libur_adapter_{options.ur_adapter_name}.so") + if os.path.isfile(adapter_path): + return adapter_path + assert False, \ + f"could not find adapter file {adapter_path} (and in similar lib paths)" - Path(build_path).mkdir(parents=True, exist_ok=True) - - return build_path + def run_bench(self, command, env_vars): + env_vars_with_forced_adapter = env_vars.copy() + env_vars_with_forced_adapter.update( + {'UR_ADAPTERS_FORCE_LOAD': Benchmark.get_adapter_full_path()}) + return run( + command=command, + env_vars=env_vars_with_forced_adapter, + add_sycl=True, + cwd=options.benchmark_cwd + ).stdout.decode() def create_data_path(self, name): data_path = os.path.join(self.directory, "data", name) @@ -58,10 +65,13 @@ def name(self): def unit(self): raise NotImplementedError() + def lower_is_better(self): + return True + def setup(self): raise NotImplementedError() - def run(self, 
env_vars): + def run(self, env_vars) -> list[Result]: raise NotImplementedError() def teardown(self): diff --git a/scripts/benchmarks/benches/bitcracker.py b/scripts/benchmarks/benches/bitcracker.py index 4b2f2aba4f..bb198433fa 100644 --- a/scripts/benchmarks/benches/bitcracker.py +++ b/scripts/benchmarks/benches/bitcracker.py @@ -31,4 +31,4 @@ def parse_output(self, stdout: str) -> float: if match: return float(match.group(1)) else: - raise ValueError("Failed to parse benchmark output.") + raise ValueError("{self.__class__.__name__}: Failed to parse benchmark output.") diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py new file mode 100644 index 0000000000..473db80a75 --- /dev/null +++ b/scripts/benchmarks/benches/compute.py @@ -0,0 +1,233 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +import csv +import io +from utils.utils import run, git_clone, create_build_path +from .base import Benchmark +from .result import Result +from .options import options + +class ComputeBench: + def __init__(self, directory): + self.directory = directory + self.built = False + + def setup(self): + if self.built: + return + + repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "08c41bb8bc1762ad53c6194df6d36bfcceff4aa2") + build_path = create_build_path(self.directory, 'compute-benchmarks-build') + + configure_command = [ + "cmake", + f"-B {build_path}", + f"-S {repo_path}", + f"-DCMAKE_BUILD_TYPE=Release", + f"-DBUILD_SYCL=ON", + f"-DSYCL_COMPILER_ROOT={options.sycl}", + f"-DALLOW_WARNINGS=ON", + f"-DBUILD_UR=ON", + f"-Dunified-runtime_DIR={options.ur_dir}/lib/cmake/unified-runtime", + ] + + print(f"{self.__class__.__name__}: Run {configure_command}") + run(configure_command, add_sycl=True) + 
print(f"{self.__class__.__name__}: Run cmake --build {build_path} -j") + run(f"cmake --build {build_path} -j", add_sycl=True) + + self.built = True + +class ComputeBenchmark(Benchmark): + def __init__(self, bench, name, test): + self.bench = bench + self.bench_name = name + self.test = test + super().__init__(bench.directory) + + def bin_args(self) -> list[str]: + return [] + + def extra_env_vars(self) -> dict: + return {} + + def unit(self): + return "μs" + + def setup(self): + self.benchmark_bin = os.path.join(self.bench.directory, 'compute-benchmarks-build', 'bin', self.bench_name) + self.bench.setup() + + def run(self, env_vars) -> list[Result]: + command = [ + f"{self.benchmark_bin}", + f"--test={self.test}", + "--csv", + "--noHeaders" + ] + + command += self.bin_args() + env_vars.update(self.extra_env_vars()) + + result = self.run_bench(command, env_vars) + (label, mean) = self.parse_output(result) + return [ Result(label=self.name(), value=mean, command=command, env=env_vars, stdout=result, lower_is_better=self.lower_is_better()) ] + + def parse_output(self, output): + csv_file = io.StringIO(output) + reader = csv.reader(csv_file) + next(reader, None) + data_row = next(reader, None) + if data_row is None: + raise ValueError("Benchmark output does not contain data.") + try: + label = data_row[0] + mean = float(data_row[1]) + return (label, mean) + except (ValueError, IndexError) as e: + raise ValueError(f"Error parsing output: {e}") + + def teardown(self): + return + +class SubmitKernelSYCL(ComputeBenchmark): + def __init__(self, bench, ioq): + self.ioq = ioq + super().__init__(bench, "api_overhead_benchmark_sycl", "SubmitKernel") + + def name(self): + order = "in order" if self.ioq else "out of order" + return f"api_overhead_benchmark_sycl SubmitKernel {order}" + + def bin_args(self) -> list[str]: + return [ + f"--Ioq={self.ioq}", + "--DiscardEvents=0", + "--MeasureCompletion=0", + "--iterations=100000", + "--Profiling=0", + "--NumKernels=10", + 
"--KernelExecTime=1" + ] + +class SubmitKernelUR(ComputeBenchmark): + def __init__(self, bench, ioq): + self.ioq = ioq + super().__init__(bench, "api_overhead_benchmark_ur", "SubmitKernel") + + def name(self): + order = "in order" if self.ioq else "out of order" + return f"api_overhead_benchmark_ur SubmitKernel {order}" + + def bin_args(self) -> list[str]: + return [ + f"--Ioq={self.ioq}", + "--DiscardEvents=0", + "--MeasureCompletion=0", + "--iterations=100000", + "--Profiling=0", + "--NumKernels=10", + "--KernelExecTime=1" + ] + +class ExecImmediateCopyQueue(ComputeBenchmark): + def __init__(self, bench, ioq, isCopyOnly, source, destination, size): + self.ioq = ioq + self.isCopyOnly = isCopyOnly + self.source = source + self.destination = destination + self.size = size + super().__init__(bench, "api_overhead_benchmark_sycl", "ExecImmediateCopyQueue") + + def name(self): + order = "in order" if self.ioq else "out of order" + return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}" + + def bin_args(self) -> list[str]: + return [ + "--iterations=100000", + f"--ioq={self.ioq}", + f"--IsCopyOnly={self.isCopyOnly}", + "--MeasureCompletionTime=0", + f"--src={self.destination}", + f"--dst={self.destination}", + f"--size={self.size}" + ] + +class QueueInOrderMemcpy(ComputeBenchmark): + def __init__(self, bench, isCopyOnly, source, destination, size): + self.isCopyOnly = isCopyOnly + self.source = source + self.destination = destination + self.size = size + super().__init__(bench, "memory_benchmark_sycl", "QueueInOrderMemcpy") + + def name(self): + return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}" + + def bin_args(self) -> list[str]: + return [ + "--iterations=10000", + f"--IsCopyOnly={self.isCopyOnly}", + f"--sourcePlacement={self.source}", + f"--destinationPlacement={self.destination}", + f"--size={self.size}", + "--count=100" + ] + +class 
QueueMemcpy(ComputeBenchmark): + def __init__(self, bench, source, destination, size): + self.source = source + self.destination = destination + self.size = size + super().__init__(bench, "memory_benchmark_sycl", "QueueMemcpy") + + def name(self): + return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}" + + def bin_args(self) -> list[str]: + return [ + "--iterations=10000", + f"--sourcePlacement={self.source}", + f"--destinationPlacement={self.destination}", + f"--size={self.size}", + ] + +class StreamMemory(ComputeBenchmark): + def __init__(self, bench, type, size, placement): + self.type = type + self.size = size + self.placement = placement + super().__init__(bench, "memory_benchmark_sycl", "StreamMemory") + + def name(self): + return f"memory_benchmark_sycl StreamMemory, placement {self.placement}, type {self.type}, size {self.size}" + + def bin_args(self) -> list[str]: + return [ + "--iterations=10000", + f"--type={self.type}", + f"--size={self.size}", + f"--memoryPlacement={self.placement}", + "--useEvents=0", + "--contents=Zeros", + ] + +class VectorSum(ComputeBenchmark): + def __init__(self, bench): + super().__init__(bench, "miscellaneous_benchmark_sycl", "VectorSum") + + def name(self): + return f"miscellaneous_benchmark_sycl VectorSum" + + def bin_args(self) -> list[str]: + return [ + "--iterations=1000", + "--numberOfElementsX=512", + "--numberOfElementsY=256", + "--numberOfElementsZ=256", + ] diff --git a/scripts/benchmarks/benches/cudaSift.py b/scripts/benchmarks/benches/cudaSift.py index 6f9c19040e..482d258052 100644 --- a/scripts/benchmarks/benches/cudaSift.py +++ b/scripts/benchmarks/benches/cudaSift.py @@ -9,11 +9,18 @@ from utils.utils import run import os import re +import shutil class CudaSift(VelocityBase): def __init__(self, vb: VelocityBench): super().__init__("cudaSift", "cudaSift", vb) + def download_deps(self): + images = os.path.join(self.vb.repo_path, self.bench_name, 'inputData') + dest = 
os.path.join(self.directory, 'inputData') + if not os.path.exists(dest): + shutil.copytree(images, dest) + def name(self): return "Velocity-Bench CudaSift" diff --git a/scripts/benchmarks/benches/easywave.py b/scripts/benchmarks/benches/easywave.py index 2fa4d95685..2f89482329 100644 --- a/scripts/benchmarks/benches/easywave.py +++ b/scripts/benchmarks/benches/easywave.py @@ -14,6 +14,8 @@ class Easywave(VelocityBase): def __init__(self, vb: VelocityBench): super().__init__("easywave", "easyWave_sycl", vb) + + def download_deps(self): self.download_untar("easywave", "https://git.gfz-potsdam.de/id2/geoperil/easyWave/-/raw/master/data/examples.tar.gz", "examples.tar.gz") def name(self): diff --git a/scripts/benchmarks/benches/hashtable.py b/scripts/benchmarks/benches/hashtable.py index c8cb0bdb03..c5ed397dbb 100644 --- a/scripts/benchmarks/benches/hashtable.py +++ b/scripts/benchmarks/benches/hashtable.py @@ -23,9 +23,12 @@ def unit(self): def bin_args(self) -> list[str]: return ["--no-verify"] + def lower_is_better(self): + return False + def parse_output(self, stdout: str) -> float: match = re.search(r'(\d+\.\d+) million keys/second', stdout) if match: return float(match.group(1)) else: - raise ValueError("Failed to parse keys per second from benchmark output.") + raise ValueError("{self.__class__.__name__}: Failed to parse keys per second from benchmark output.") diff --git a/scripts/benchmarks/benches/options.py b/scripts/benchmarks/benches/options.py index c990a44d5f..c035ce6800 100644 --- a/scripts/benchmarks/benches/options.py +++ b/scripts/benchmarks/benches/options.py @@ -5,6 +5,9 @@ class Options: sycl: str = "" rebuild: bool = True benchmark_cwd: str = "INVALID" + timeout: float = 600 + iterations: int = 5 + verbose: bool = False options = Options() diff --git a/scripts/benchmarks/benches/quicksilver.py b/scripts/benchmarks/benches/quicksilver.py index 383c8dd5be..b7600d11be 100644 --- a/scripts/benchmarks/benches/quicksilver.py +++ 
b/scripts/benchmarks/benches/quicksilver.py @@ -18,7 +18,7 @@ def __init__(self, vb: VelocityBench): def run(self, env_vars) -> list[Result]: # TODO: fix the crash in QuickSilver when UR_L0_USE_IMMEDIATE_COMMANDLISTS=0 if 'UR_L0_USE_IMMEDIATE_COMMANDLISTS' in env_vars and env_vars['UR_L0_USE_IMMEDIATE_COMMANDLISTS'] == '0': - return [] + return None return super().run(env_vars) @@ -28,6 +28,9 @@ def name(self): def unit(self): return "MMS/CTT" + def lower_is_better(self): + return False + def bin_args(self) -> list[str]: return ["-i", f"{self.data_path}/scatteringOnly.inp"] @@ -39,4 +42,4 @@ def parse_output(self, stdout: str) -> float: if match: return float(match.group(1)) else: - raise ValueError("Failed to parse benchmark output.") + raise ValueError("{self.__class__.__name__}: Failed to parse benchmark output.") diff --git a/scripts/benchmarks/benches/result.py b/scripts/benchmarks/benches/result.py index 8dd2f4ba9c..6fc7e16095 100644 --- a/scripts/benchmarks/benches/result.py +++ b/scripts/benchmarks/benches/result.py @@ -14,5 +14,7 @@ class Result: command: str env: str stdout: str + passed: bool = True unit: str = "" name: str = "" + lower_is_better: bool = True diff --git a/scripts/benchmarks/benches/syclbench.py b/scripts/benchmarks/benches/syclbench.py new file mode 100644 index 0000000000..b9d6e50623 --- /dev/null +++ b/scripts/benchmarks/benches/syclbench.py @@ -0,0 +1,420 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +import csv +import io +from utils.utils import run, git_clone, create_build_path +from .base import Benchmark +from .result import Result +from .options import options + +class SyclBench: + def __init__(self, directory): + self.directory = directory + self.built = False + self.setup() + return + + def setup(self): + if self.built: + return + + build_path = create_build_path(self.directory, 'sycl-bench-build') + repo_path = git_clone(self.directory, "sycl-bench-repo", "https://github.com/mateuszpn/sycl-bench.git", "1e6ab2cfd004a72c5336c26945965017e06eab71") + + configure_command = [ + "cmake", + f"-B {build_path}", + f"-S {repo_path}", + f"-DCMAKE_BUILD_TYPE=Release", + f"-DCMAKE_CXX_COMPILER={options.sycl}/bin/clang++", + f"-DCMAKE_C_COMPILER={options.sycl}/bin/clang", + f"-DSYCL_IMPL=dpcpp" + ] + + run(configure_command, add_sycl=True) + run(f"cmake --build {build_path} -j", add_sycl=True) + + self.built = True + +class SyclBenchmark(Benchmark): + def __init__(self, bench, name, test): + self.bench = bench + self.bench_name = name + self.test = test + self.done = False + super().__init__(bench.directory) + + def bin_args(self) -> list[str]: + return [] + + def extra_env_vars(self) -> dict: + return {} + + def unit(self): + return "ms" + + def setup(self): + self.bench.setup() + self.benchmark_bin = os.path.join(self.directory, 'sycl-bench-build', self.bench_name) + + def run(self, env_vars) -> list[Result]: + if self.done: + return + self.outputfile = os.path.join(self.bench.directory, self.test+".csv") + print(f"{self.__class__.__name__}: Results in {self.outputfile}") + command = [ + f"{self.benchmark_bin}", + f"--warmup-run", + f"--num-runs={options.iterations}", + f"--output={self.outputfile}" + ] + + command += self.bin_args() + env_vars.update(self.extra_env_vars()) + + # no output to stdout, all in outputfile + self.run_bench(command, env_vars) + + with 
open(self.outputfile, 'r') as f: + reader = csv.reader(f) + res_list = [] + for row in reader: + if not row[0].startswith('#'): + res_list.append( + Result(label=row[0], + value=float(row[12]) * 1000, # convert to ms + passed=(row[1]=="PASS"), + command=command, + env=env_vars, + stdout=row)) + self.done = True + return res_list + + def teardown(self): + print(f"Removing {self.outputfile}...") + os.remove(self.outputfile) + return + + def name(self): + return self.test + +# multi benchmarks +class Blocked_transform(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "blocked_transform", "BlockedTransform_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=2049", + f"--local=1024" + ] + +class DagTaskI(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "dag_task_throughput_independent", "IndependentDAGTaskThroughput_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=32768", + ] + +class DagTaskS(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "dag_task_throughput_sequential", "DAGTaskThroughput_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=327680", + ] + +class HostDevBandwidth(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "host_device_bandwidth", "HostDeviceBandwidth_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=512", + ] + +class LocalMem(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "local_mem", f"LocalMem_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=10240000", + ] + +class Pattern_L2(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "pattern_L2", "L2_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=1024000000", + ] + +class Reduction(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "reduction", "Pattern_Reduction_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=10240000", + ] + 
+class ScalarProd(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "scalar_prod", "ScalarProduct_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=102400000", + ] + +class SegmentReduction(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "segmentedreduction", "Pattern_SegmentedReduction_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=102400000", + ] + +class UsmAccLatency(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "usm_accessors_latency", "USM_Latency_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=4096", + ] + +class UsmAllocLatency(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "usm_allocation_latency", "USM_Allocation_latency_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=1024000000", + ] + +class UsmInstrMix(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "usm_instr_mix", "USM_Instr_Mix_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=8192", + ] + +class UsmPinnedOverhead(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "usm_pinned_overhead", "USM_Pinned_Overhead_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=10240000", + ] + +class VecAdd(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "vec_add", "VectorAddition_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=102400000", + ] + +# single benchmarks +class Arith(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "arith", "Arith_int32_512") + + def bin_args(self) -> list[str]: + return [ + f"--size=16384", + ] + +class TwoDConvolution(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "2DConvolution", "2DConvolution") + +class Two_mm(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "2mm", "2mm") + + def bin_args(self) -> list[str]: + return [ + f"--size=512", + 
] + +class Three_mm(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "3mm", "3mm") + + def bin_args(self) -> list[str]: + return [ + f"--size=512", + ] + +class Atax(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "atax", "Atax") + + def bin_args(self) -> list[str]: + return [ + f"--size=8192", + ] + +class Atomic_reduction(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "atomic_reduction", "ReductionAtomic_fp64") + +class Bicg(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "bicg", "Bicg") + + def bin_args(self) -> list[str]: + return [ + f"--size=204800", + ] + +class Correlation(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "correlation", "Correlation") + + def bin_args(self) -> list[str]: + return [ + f"--size=2048", + ] + +class Covariance(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "covariance", "Covariance") + + def bin_args(self) -> list[str]: + return [ + f"--size=2048", + ] + +class Gemm(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "gemm", "Gemm") + + def bin_args(self) -> list[str]: + return [ + f"--size=1536", + ] + +class Gesumv(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "gesummv", "Gesummv") + + def bin_args(self) -> list[str]: + return [ + f"--size=8192", + ] + +class Gramschmidt(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "gramschmidt", "Gramschmidt") + + def bin_args(self) -> list[str]: + return [ + f"--size=512", + ] + +class KMeans(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "kmeans", "Kmeans") + + def bin_args(self) -> list[str]: + return [ + f"--size=700000000", + ] + +class LinRegCoeff(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "lin_reg_coeff", "LinearRegressionCoeff") + + def bin_args(self) -> list[str]: + return [ + f"--size=1638400000", + ] + +class 
LinRegError(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "lin_reg_error", "LinearRegression") + + def bin_args(self) -> list[str]: + return [ + f"--size=4096", + ] + +class MatmulChain(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "matmulchain", "MatmulChain") + + def bin_args(self) -> list[str]: + return [ + f"--size=2048", + ] + +class MolDyn(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "mol_dyn", "MolecularDynamics") + + def bin_args(self) -> list[str]: + return [ + f"--size=8196", + ] + +class Mvt(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "mvt", "Mvt") + + def bin_args(self) -> list[str]: + return [ + f"--size=32767", + ] + +class NBody(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "nbody", "NBody_") + + def bin_args(self) -> list[str]: + return [ + f"--size=81920", + ] + +class Sf(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "sf", "sf_16") + + def bin_args(self) -> list[str]: + return [ + f"--size=5000000000", + ] + +class Syr2k(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "syr2k", "Syr2k") + + def bin_args(self) -> list[str]: + return [ + f"--size=2048", + ] + +class Syrk(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "syrk", "Syrk") + + def bin_args(self) -> list[str]: + return [ + f"--size=1024", + ] diff --git a/scripts/benchmarks/benches/velocity.py b/scripts/benchmarks/benches/velocity.py index fec3abb842..3c903bf11b 100644 --- a/scripts/benchmarks/benches/velocity.py +++ b/scripts/benchmarks/benches/velocity.py @@ -6,15 +6,14 @@ from utils.utils import git_clone from .base import Benchmark from .result import Result -from utils.utils import run +from utils.utils import run, create_build_path +from .options import options import os -import re class VelocityBench: def __init__(self, directory): self.directory = directory - # TODO: replace with 
https://github.com/oneapi-src/Velocity-Bench once all fixes land upstream - self.repo_path = git_clone(self.directory, "velocity-bench-repo", "https://github.com/pbalcer/Velocity-Bench.git", "ae0ae05c7fd1469779ecea4f36e4741b1d956eb4") + self.repo_path = git_clone(self.directory, "velocity-bench-repo", "https://github.com/oneapi-src/Velocity-Bench/", "b22215c16f789100449c34bf4eaa3fb178983d69") class VelocityBase(Benchmark): def __init__(self, name: str, bin_name: str, vb: VelocityBench): @@ -24,8 +23,14 @@ def __init__(self, name: str, bin_name: str, vb: VelocityBench): self.bin_name = bin_name self.code_path = os.path.join(self.vb.repo_path, self.bench_name, 'SYCL') + def download_deps(self): + return + def setup(self): - build_path = self.create_build_path(self.bench_name) + self.download_deps() + self.benchmark_bin = os.path.join(self.directory, self.bench_name, self.bin_name) + + build_path = create_build_path(self.directory, self.bench_name) configure_command = [ "cmake", @@ -36,8 +41,6 @@ def setup(self): run(configure_command, {'CC': 'clang', 'CXX':'clang++'}, add_sycl=True) run(f"cmake --build {build_path} -j", add_sycl=True) - self.benchmark_bin = os.path.join(build_path, self.bin_name) - def bin_args(self) -> list[str]: return [] @@ -57,7 +60,7 @@ def run(self, env_vars) -> list[Result]: result = self.run_bench(command, env_vars) - return [Result(label=self.bench_name, value=self.parse_output(result), command=command, env=env_vars, stdout=result)] + return [ Result(label=self.name(), value=self.parse_output(result), command=command, env=env_vars, stdout=result, lower_is_better=self.lower_is_better()) ] def teardown(self): return diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index 5dad40c7fe..dac43643fc 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -5,9 +5,8 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import os from utils.utils import prepare_workdir, 
load_benchmark_results, save_benchmark_results; -from benches.api_overhead import APIOverheadSYCL +from benches.compute import * from benches.hashtable import Hashtable from benches.bitcracker import Bitcracker from benches.cudaSift import CudaSift @@ -15,53 +14,148 @@ from benches.quicksilver import QuickSilver from benches.SobelFilter import SobelFilter from benches.velocity import VelocityBench +from benches.syclbench import * from benches.options import options from output import generate_markdown import argparse +import re +import subprocess # Update this if you are changing the layout of the results files -INTERNAL_WORKDIR_VERSION = '1.0' - -def main(directory, additional_env_vars, save_name, compare_names): - variants = [ - ({'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '0'}, "Imm-CmdLists-OFF"), - ({'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '1'}, ""), - ] +INTERNAL_WORKDIR_VERSION = '1.7' +def main(directory, additional_env_vars, save_name, compare_names, filter): prepare_workdir(directory, INTERNAL_WORKDIR_VERSION) + cb = ComputeBench(directory) vb = VelocityBench(directory) + sb = SyclBench(directory) benchmarks = [ - APIOverheadSYCL(directory), + # *** Compute benchmarks + SubmitKernelSYCL(cb, 0), + SubmitKernelSYCL(cb, 1), + SubmitKernelUR(cb, 0), + SubmitKernelUR(cb, 1), + QueueInOrderMemcpy(cb, 0, 'Device', 'Device', 1024), + QueueInOrderMemcpy(cb, 0, 'Host', 'Device', 1024), + QueueMemcpy(cb, 'Device', 'Device', 1024), + StreamMemory(cb, 'Triad', 10 * 1024, 'Device'), + ExecImmediateCopyQueue(cb, 0, 1, 'Device', 'Device', 1024), + ExecImmediateCopyQueue(cb, 1, 1, 'Device', 'Host', 1024), + VectorSum(cb), + + # *** Velocity benchmarks Hashtable(vb), Bitcracker(vb), - #CudaSift(vb), TODO: the benchmark is passing, but is outputting "Failed to allocate device data" + CudaSift(vb), Easywave(vb), QuickSilver(vb), - SobelFilter(vb) + SobelFilter(vb), + + # *** sycl-bench multi benchmarks + # Blocked_transform(sb), # run time < 1ms + DagTaskI(sb), + DagTaskS(sb), + 
HostDevBandwidth(sb), + LocalMem(sb), + Pattern_L2(sb), + Reduction(sb), + ScalarProd(sb), + SegmentReduction(sb), + UsmAccLatency(sb), + UsmAllocLatency(sb), + UsmInstrMix(sb), + UsmPinnedOverhead(sb), + VecAdd(sb), + + # *** sycl-bench single benchmarks + # TwoDConvolution(sb), # run time < 1ms + Two_mm(sb), + Three_mm(sb), + # Arith(sb), # run time < 1ms + Atax(sb), + # Atomic_reduction(sb), # run time < 1ms + Bicg(sb), + Correlation(sb), + Covariance(sb), + Gemm(sb), + Gesumv(sb), + Gramschmidt(sb), + KMeans(sb), + LinRegCoeff(sb), + # LinRegError(sb), # run time < 1ms + MatmulChain(sb), + MolDyn(sb), + Mvt(sb), + Sf(sb), + Syr2k(sb), + Syrk(sb), ] + if filter: + benchmarks = [benchmark for benchmark in benchmarks if filter.search(benchmark.name())] + for benchmark in benchmarks: - benchmark.setup() + try: + print(f"setting up {benchmark.name()}... ", end='', flush=True) + benchmark.setup() + print("complete.") + + except Exception as e: + if options.exit_on_failure: + raise e + else: + print(f"failed: {e}") results = [] for benchmark in benchmarks: - for env_vars, extra_label in variants: - merged_env_vars = {**env_vars, **additional_env_vars} - bench_results = benchmark.run(merged_env_vars) - for res in bench_results: - res.unit = benchmark.unit() - res.name = benchmark.name() - res.label += f" {extra_label}" - results.append(res) + try: + merged_env_vars = {**additional_env_vars} + iteration_results = [] + for iter in range(options.iterations): + print(f"running {benchmark.name()}, iteration {iter}... 
", end='', flush=True) + bench_results = benchmark.run(merged_env_vars) + if bench_results is not None: + for bench_result in bench_results: + if bench_result.passed: + print(f"complete ({bench_result.label}: {bench_result.value:.3f} {benchmark.unit()}).") + else: + print(f"complete ({bench_result.label}: verification FAILED)") + iteration_results.append(bench_result) + else: + print(f"did not finish (OK for sycl-bench).") + break; + + if len(iteration_results) == 0: + continue + + for label in set([result.label for result in iteration_results]): + label_results = [result for result in iteration_results if result.label == label and result.passed == True] + if len(label_results) > 0: + label_results.sort(key=lambda res: res.value) + median_index = len(label_results) // 2 + median_result = label_results[median_index] + + median_result.unit = benchmark.unit() + median_result.name = label + + results.append(median_result) + except Exception as e: + if options.exit_on_failure: + raise e + else: + print(f"failed: {e}") for benchmark in benchmarks: + print(f"tearing down {benchmark.name()}... 
", end='', flush=True) benchmark.teardown() + print("complete.") chart_data = {"This PR" : results} for name in compare_names: + print(f"compare name: {name}") compare_result = load_benchmark_results(directory, name) if compare_result: chart_data[name] = compare_result @@ -74,7 +168,7 @@ def main(directory, additional_env_vars, save_name, compare_names): with open('benchmark_results.md', 'w') as file: file.write(markdown_content) - print("Markdown with benchmark results has been written to benchmark_results.md") + print(f"Markdown with benchmark results has been written to {os.getcwd()}/benchmark_results.md") def validate_and_parse_env_args(env_args): env_vars = {} @@ -89,15 +183,32 @@ def validate_and_parse_env_args(env_args): parser = argparse.ArgumentParser(description='Unified Runtime Benchmark Runner') parser.add_argument('benchmark_directory', type=str, help='Working directory to setup benchmarks.') parser.add_argument('sycl', type=str, help='Root directory of the SYCL compiler.') + parser.add_argument('ur_dir', type=str, help='UR install prefix path') + parser.add_argument('ur_adapter_name', type=str, help='Options to build the Unified Runtime as part of the benchmark') parser.add_argument("--no-rebuild", help='Rebuild the benchmarks from scratch.', action="store_true") parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[]) parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.') parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"]) + parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=5) + parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=600) + parser.add_argument("--filter", type=str, help='Regex pattern to filter benchmarks by name.', 
default=None) + parser.add_argument("--epsilon", type=float, help='Threshold to consider change of performance significant', default=0.005) + parser.add_argument("--verbose", help='Print output of all the commands.', action="store_true") + parser.add_argument("--exit_on_failure", help='Exit on first failure.', action="store_true") args = parser.parse_args() additional_env_vars = validate_and_parse_env_args(args.env) + options.verbose = args.verbose options.rebuild = not args.no_rebuild options.sycl = args.sycl + options.iterations = args.iterations + options.timeout = args.timeout + options.epsilon = args.epsilon + options.ur_dir = args.ur_dir + options.ur_adapter_name = args.ur_adapter_name + options.exit_on_failure = args.exit_on_failure + + benchmark_filter = re.compile(args.filter) if args.filter else None - main(args.benchmark_directory, additional_env_vars, args.save, args.compare) + main(args.benchmark_directory, additional_env_vars, args.save, args.compare, benchmark_filter) diff --git a/scripts/benchmarks/output.py b/scripts/benchmarks/output.py index 9cfee303b1..eec8957fe7 100644 --- a/scripts/benchmarks/output.py +++ b/scripts/benchmarks/output.py @@ -3,67 +3,33 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import collections +import collections, re from benches.base import Result +from benches.options import options +import math -# Function to generate the mermaid bar chart script -def generate_mermaid_script(chart_data: dict[str, list[Result]]): - benches = collections.defaultdict(list) - for (_, data) in chart_data.items(): - for res in data: - benches[res.name].append(res.label) - - mermaid_script = "" - - for (bname, labels) in benches.items(): - # remove duplicates - labels = list(dict.fromkeys(labels)) - mermaid_script += f""" -```mermaid ---- -config: - gantt: - rightPadding: 10 - leftPadding: 120 - sectionFontSize: 10 - numberSectionStyles: 2 ---- -gantt - title {bname} - todayMarker off - dateFormat X - 
axisFormat %s -""" - for label in labels: - nbars = 0 - print_label = label.replace(" ", "
") - mermaid_script += f""" - section {print_label} -""" - for (name, data) in chart_data.items(): - for res in data: - if bname == res.name and label == res.label: - nbars += 1 - mean = res.value - crit = "crit," if name == "This PR" else "" - mermaid_script += f""" - {name} ({mean} {res.unit}) : {crit} 0, {int(mean)} -""" - padding = 4 - nbars - if padding > 0: - for _ in range(padding): - mermaid_script += f""" - - : 0, 0 -""" - mermaid_script += f""" -``` -""" +class OutputLine: + def __init__(self, name): + self.label = name + self.diff = None + self.bars = None + self.row = "" + + def __str__(self): + return f"(Label:{self.label}, diff:{self.diff})" - return mermaid_script + def __repr__(self): + return self.__str__() # Function to generate the markdown collapsible sections for each variant def generate_markdown_details(results: list[Result]): markdown_sections = [] + + markdown_sections.append(f""" +
+Benchmark details - environment, command, output... +""") + for res in results: env_vars_str = '\n'.join(f"{key}={value}" for key, value in res.env.items()) markdown_sections.append(f""" @@ -79,48 +45,180 @@ def generate_markdown_details(results: list[Result]): #### Output: {res.stdout} +
+""") + markdown_sections.append(f""" """) return "\n".join(markdown_sections) -def generate_summary(chart_data: dict[str, list[Result]]) -> str: - # Calculate the mean value of "This PR" for each benchmark - this_pr_means = {} - for res in chart_data["This PR"]: - if res.name not in this_pr_means: - this_pr_means[res.name] = [] - this_pr_means[res.name].append(res.value) - for bname in this_pr_means: - this_pr_means[bname] = sum(this_pr_means[bname]) / len(this_pr_means[bname]) - - # Calculate the percentage for each entry relative to "This PR" - summary_data = {"This PR": 100} - for entry_name, results in chart_data.items(): - if entry_name == "This PR": - continue - entry_sum = 0 +def generate_summary_table_and_chart(chart_data: dict[str, list[Result]]): + summary_table = "| Benchmark | " + " | ".join(chart_data.keys()) + " | Relative perf | Change | - |\n" + summary_table += "|---" * (len(chart_data) + 4) + "|\n" + + # Collect all benchmarks and their results + benchmark_results = collections.defaultdict(dict) + for key, results in chart_data.items(): for res in results: - if res.name in this_pr_means: - percentage = (res.value / this_pr_means[res.name]) * 100 - entry_sum += percentage + benchmark_results[res.name][key] = res + + # Generate the table rows + output_detailed_list = [] + + + global_product = 1 + mean_cnt = 0 + improved = 0 + regressed = 0 + no_change = 0 + + for bname, results in benchmark_results.items(): + oln = OutputLine(bname) + oln.row = f"| {bname} |" + best_value = None + best_key = None + + # Determine the best value + for key, res in results.items(): + if best_value is None or (res.lower_is_better and res.value < best_value) or (not res.lower_is_better and res.value > best_value): + best_value = res.value + best_key = key + + # Generate the row with the best value highlighted + if options.verbose: print(f"Results: {results}") + for key in chart_data.keys(): + if key in results: + intv = results[key].value + if key == best_key: + oln.row 
+= f" {intv:3f} {results[key].unit} |" # Highlight the best value + else: + oln.row += f" {intv:.3f} {results[key].unit} |" + else: + oln.row += " - |" + + if len(chart_data.keys()) == 2: + key0 = list(chart_data.keys())[0] + key1 = list(chart_data.keys())[1] + if (key0 in results) and (key1 in results): + v0 = results[key0].value + v1 = results[key1].value + diff = None + if v0 != 0 and results[key0].lower_is_better: + diff = v1/v0 + elif v1 != 0 and not results[key0].lower_is_better: + diff = v0/v1 + + if diff != None: + oln.row += f"{(diff * 100):.2f}%" + oln.diff = diff + + output_detailed_list.append(oln) + + + sorted_detailed_list = sorted(output_detailed_list, key=lambda x: (x.diff is not None, x.diff), reverse=True) + + diff_values = [oln.diff for oln in sorted_detailed_list if oln.diff is not None] + + if len(diff_values) > 0: + max_diff = max(max(diff_values) - 1, 1 - min(diff_values)) + + for oln in sorted_detailed_list: + if oln.diff != None: + oln.row += f" | {(oln.diff - 1)*100:.2f}%" + delta = oln.diff - 1 + oln.bars = round(10*(oln.diff - 1)/max_diff) + if oln.bars == 0 or abs(delta) < options.epsilon: + oln.row += " | . 
|" + elif oln.bars > 0: + oln.row += f" | {'+' * oln.bars} |" + else: + oln.row += f" | {'-' * (-oln.bars)} |" + + mean_cnt += 1 + if abs(delta) > options.epsilon: + if delta > 0: + improved+=1 + else: + regressed+=1 + else: + no_change+=1 + + global_product *= oln.diff + else: + oln.row += " | |" + + if options.verbose: print(oln.row) + summary_table += oln.row + "\n" + else: + for oln in sorted_detailed_list: + oln.row += " | |" + if options.verbose: print(oln.row) + summary_table += oln.row + "\n" + + + grouped_objects = collections.defaultdict(list) + + for oln in output_detailed_list: + s = oln.label + prefix = re.match(r'^[^_\s]+', s)[0] + grouped_objects[prefix].append(oln) + + grouped_objects = dict(grouped_objects) + + if mean_cnt > 0: + global_mean = global_product ** (1/mean_cnt) + summary_line = f"Total {mean_cnt} benchmarks in mean. " + summary_line += "\n" + f"Geomean {global_mean*100:.3f}%. \nImproved {improved} Regressed {regressed} (threshold {options.epsilon*100:.2f}%)" + else: + summary_line = f"No diffs to calculate performance change" + + if options.verbose: print(summary_line) + + + summary_table = "\n## Performance change in benchmark groups\n" + + for name, outgroup in grouped_objects.items(): + outgroup_s = sorted(outgroup, key=lambda x: (x.diff is not None, x.diff), reverse=True) + product = 1.0 + n = len(outgroup_s) + r = 0 + for oln in outgroup_s: + if oln.diff != None: + product *= oln.diff + r += 1 + if r > 0: + summary_table += f""" +
+ Relative perf in group {name} ({n}): {math.pow(product, 1/r)*100:.3f}% - entry_average = entry_sum / len(results) if results else 0 - summary_data[entry_name] = entry_average +""" + else: + summary_table += f""" +
+ Relative perf in group {name} ({n}): cannot calculate - markdown_table = "| Name | Result % |\n| --- | --- |\n" - for entry_name, percentage in summary_data.items(): - markdown_table += f"| {entry_name} | {percentage:.2f}% |\n" +""" + summary_table += "| Benchmark | " + " | ".join(chart_data.keys()) + " | Relative perf | Change | - |\n" + summary_table += "|---" * (len(chart_data) + 4) + "|\n" + + for oln in outgroup_s: + summary_table += f"{oln.row}\n" + + summary_table += f""" +
+ +""" - return markdown_table + return summary_line, summary_table def generate_markdown(chart_data: dict[str, list[Result]]): - mermaid_script = generate_mermaid_script(chart_data) + (summary_line, summary_table) = generate_summary_table_and_chart(chart_data) return f""" # Summary -{generate_summary(chart_data)} -# Benchmark Results -{mermaid_script} -## Details +{summary_line}\n +(result is better)\n +{summary_table} +# Details {generate_markdown_details(chart_data["This PR"])} """ diff --git a/scripts/benchmarks/utils/utils.py b/scripts/benchmarks/utils/utils.py index 9dc3f23a9b..586837fc6f 100644 --- a/scripts/benchmarks/utils/utils.py +++ b/scripts/benchmarks/utils/utils.py @@ -8,9 +8,6 @@ import shutil import subprocess # nosec B404 from pathlib import Path -from collections import defaultdict -import csv -import io from benches.result import Result from benches.options import options @@ -28,9 +25,12 @@ def run(command, env_vars={}, cwd=None, add_sycl=False): env['LD_LIBRARY_PATH'] = sycl_lib_path + os.pathsep + env.get('LD_LIBRARY_PATH', '') env.update(env_vars) - result = subprocess.run(command, cwd=cwd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) # nosec B603 - print(result.stdout.decode()) - print(result.stderr.decode()) + result = subprocess.run(command, cwd=cwd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, timeout=options.timeout) # nosec B603 + + if options.verbose: + print(result.stdout.decode()) + print(result.stderr.decode()) + return result except subprocess.CalledProcessError as e: print(e.stdout.decode()) @@ -42,6 +42,7 @@ def git_clone(dir, name, repo, commit): if os.path.isdir(repo_path) and os.path.isdir(os.path.join(repo_path, '.git')): run("git fetch", cwd=repo_path) + run("git reset --hard", cwd=repo_path) run(f"git checkout {commit}", cwd=repo_path) elif not os.path.exists(repo_path): run(f"git clone --recursive {repo} {repo_path}") @@ -70,7 +71,8 @@ def load_benchmark_results(dir, 
compare_name) -> list[Result]: return None def prepare_bench_cwd(dir): - options.benchmark_cwd = os.path.join(dir, 'bcwd') + # we need 2 deep to workaround a problem with a fixed relative path in cudaSift + options.benchmark_cwd = os.path.join(dir, 'bcwd', 'bcwd') if os.path.exists(options.benchmark_cwd): shutil.rmtree(options.benchmark_cwd) os.makedirs(options.benchmark_cwd) @@ -97,3 +99,13 @@ def prepare_workdir(dir, version): with open(version_file_path, 'w') as version_file: version_file.write(version) + +def create_build_path(directory, name): + build_path = os.path.join(directory, name) + + if options.rebuild and Path(build_path).exists(): + shutil.rmtree(build_path) + + Path(build_path).mkdir(parents=True, exist_ok=True) + + return build_path diff --git a/scripts/benchmarks/workflow.png b/scripts/benchmarks/workflow.png new file mode 100644 index 0000000000..1db06cad9d Binary files /dev/null and b/scripts/benchmarks/workflow.png differ diff --git a/scripts/check-hardening.sh b/scripts/check-hardening.sh new file mode 100755 index 0000000000..781651744f --- /dev/null +++ b/scripts/check-hardening.sh @@ -0,0 +1,42 @@ +#!/bin/sh +if [ -z $1 ]; then + echo "Usage: $0 builddir" >&2; + exit; +fi + +which hardening-check >> /dev/null; +if [ $? != "0" ]; then + echo "hardening-check not found - on Ubuntu it is from the 'devscripts' package." >&2; + exit; +fi + +RET=0; + +for file in $1/bin/*; do + case "$file" in + */urtrace) + # This is a python script + true;; + *) + hardening-check -q --nocfprotection --nofortify $file;; + esac + RET=$(($RET + $?)) +done; + +for file in $1/lib/*.so; do + case "$file" in + */libOpenCL*) + # This is not built as part of UR + true;; + */libzeCallMap.so | */libur_mock_headers.so) + # Only used in testing, and are too simple for many of the hardening flags to have an effect. 
+ true;; + *) + hardening-check -q --nocfprotection --nofortify $file;; + esac + RET=$(($RET + $?)) +done; + +if [ $RET != "0" ]; then + exit 1; +fi diff --git a/scripts/core/CONTRIB.rst b/scripts/core/CONTRIB.rst index f78b6d23df..3ee07ba9a4 100644 --- a/scripts/core/CONTRIB.rst +++ b/scripts/core/CONTRIB.rst @@ -127,9 +127,9 @@ Adapter Change Process .. _intel/llvm: https://github.com/intel/llvm .. _UNIFIED_RUNTIME_REPO: - https://github.com/intel/llvm/blob/sycl/sycl/plugins/unified_runtime/CMakeLists.txt#L102 + https://github.com/intel/llvm/blob/sycl/sycl/cmake/modules/FetchUnifiedRuntime.cmake#L119 .. _UNIFIED_RUNTIME_TAG: - https://github.com/intel/llvm/blob/sycl/sycl/plugins/unified_runtime/CMakeLists.txt#L109 + https://github.com/intel/llvm/blob/sycl/sycl/cmake/modules/FetchUnifiedRuntime.cmake#L126 Build Environment ================= diff --git a/scripts/core/EXP-BINDLESS-IMAGES.rst b/scripts/core/EXP-BINDLESS-IMAGES.rst index a68cf91f39..c2b3d1114e 100644 --- a/scripts/core/EXP-BINDLESS-IMAGES.rst +++ b/scripts/core/EXP-BINDLESS-IMAGES.rst @@ -66,8 +66,8 @@ Enums ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * ${x}_structure_type_t ${X}_STRUCTURE_TYPE_EXP_SAMPLER_MIP_PROPERTIES - ${X}_STRUCTURE_TYPE_EXP_INTEROP_MEM_DESC - ${X}_STRUCTURE_TYPE_EXP_INTEROP_SEMAPHORE_DESC + ${X}_STRUCTURE_TYPE_EXP_EXTERNAL_MEM_DESC + ${X}_STRUCTURE_TYPE_EXP_EXTERNAL_SEMAPHORE_DESC ${X}_STRUCTURE_TYPE_EXP_FILE_DESCRIPTOR ${X}_STRUCTURE_TYPE_EXP_WIN32_HANDLE ${X}_STRUCTURE_TYPE_EXP_SAMPLER_ADDR_MODES @@ -86,8 +86,8 @@ Enums * ${X}_DEVICE_INFO_MIPMAP_ANISOTROPY_SUPPORT_EXP * ${X}_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP * ${X}_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP - * ${X}_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP - * ${X}_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP + * ${X}_DEVICE_INFO_EXTERNAL_MEMORY_IMPORT_SUPPORT_EXP + * ${X}_DEVICE_INFO_EXTERNAL_SEMAPHORE_IMPORT_SUPPORT_EXP * ${X}_DEVICE_INFO_CUBEMAP_SUPPORT_EXP * 
${X}_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP @@ -101,8 +101,8 @@ Enums * ${X}_DEVICE_INFO_BINDLESS_SAMPLE_2D_USM_EXP * ${x}_command_t - * ${X}_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP - * ${X}_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP + * ${X}_COMMAND_EXTERNAL_SEMAPHORE_WAIT_EXP + * ${X}_COMMAND_EXTERNAL_SEMAPHORE_SIGNAL_EXP * ${x}_exp_image_copy_flags_t * ${X}_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE @@ -137,7 +137,7 @@ Enums * ${X}_FUNCTION_BINDLESS_IMAGES_MIPMAP_FREE_EXP * ${X}_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_MEMORY_EXP * ${X}_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_ARRAY_EXP - * ${X}_FUNCTION_BINDLESS_IMAGES_RELEASE_INTEROP_EXP + * ${X}_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP * ${X}_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_EXP * ${X}_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_SEMAPHORE_EXP * ${X}_FUNCTION_BINDLESS_IMAGES_WAIT_EXTERNAL_SEMAPHORE_EXP @@ -151,10 +151,10 @@ Types * ${x}_exp_sampler_mip_properties_t * ${x}_exp_image_native_handle_t * ${x}_exp_image_mem_native_handle_t -* ${x}_exp_interop_mem_handle_t -* ${x}_exp_interop_semaphore_handle_t -* ${x}_exp_interop_mem_desc_t -* ${x}_exp_interop_semaphore_desc_t +* ${x}_exp_external_mem_handle_t +* ${x}_exp_external_semaphore_handle_t +* ${x}_exp_external_mem_desc_t +* ${x}_exp_external_semaphore_desc_t * ${x}_exp_file_descriptor_t * ${x}_exp_win32_handle_t * ${x}_exp_sampler_addr_modes_t @@ -181,7 +181,8 @@ Functions * Interop * ${x}BindlessImagesImportExternalMemoryExp * ${x}BindlessImagesMapExternalArrayExp - * ${x}BindlessImagesReleaseInteropExp + * ${x}BindlessImagesMapExternalLinearMemoryExp + * ${x}BindlessImagesReleaseExternalMemoryExp * ${x}BindlessImagesImportExternalSemaphoreExp * ${x}BindlessImagesReleaseExternalSemaphoreExp * ${x}BindlessImagesWaitExternalSemaphoreExp @@ -247,6 +248,11 @@ Changelog | 16.0 || Update device queries to resolve inconsistencies and | | || missing queries. 
| +----------+-------------------------------------------------------------+ +| 17.0 || Rename interop related structs and funcs with "external" | +| || keyword over "interop". | ++----------+-------------------------------------------------------------+ +| 18.0 | Added BindlessImagesMapExternalLinearMemoryExp function. | ++----------+-------------------------------------------------------------+ Contributors -------------------------------------------------------------------------------- diff --git a/scripts/core/EXP-COMMAND-BUFFER.rst b/scripts/core/EXP-COMMAND-BUFFER.rst index c23519cf67..d6ef76c7bc 100644 --- a/scripts/core/EXP-COMMAND-BUFFER.rst +++ b/scripts/core/EXP-COMMAND-BUFFER.rst @@ -78,6 +78,7 @@ Command-Buffers are tied to a specific ${x}_context_handle_t and ${x}_device_handle_t. ${x}CommandBufferCreateExp optionally takes a descriptor to provide additional properties for how the command-buffer should be constructed. The members defined in ${x}_exp_command_buffer_desc_t are: + * ``isUpdatable``, which should be set to ``true`` to support :ref:`updating command-buffer commands`. * ``isInOrder``, which should be set to ``true`` to enable commands enqueued to @@ -95,12 +96,13 @@ Commands can be appended to a command-buffer by calling any of the command-buffer append functions. Typically these closely mimic the existing enqueue functions in the Core API in terms of their command-specific parameters. However, they differ in that they take a command-buffer handle instead of a -queue handle, and the dependencies and return parameters are sync-points instead -of event handles. +queue handle. Dependencies are also expressed differently, in that internal +command-buffer dependencies are expressed with sync-points. While event handles +are used to express synchronization external to the command-buffer. -The entry-point for appending a kernel launch command also returns an optional -handle to the command being appended. 
This handle can be used to update the -command configuration between command-buffer executions, see the section on +The entry-points for appending commands also return an optional handle to the +command being appended. This handle can be used to update the command +configuration between command-buffer executions, see the section on :ref:`updating command-buffer commands`. Currently only the following commands are supported: @@ -122,7 +124,7 @@ It is planned to eventually support any command type from the Core API which can actually be appended to the equivalent adapter native constructs. Sync-Points --------------------------------------------------------------------------------- +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ A sync-point is a value which represents a command inside of a command-buffer which is returned from command-buffer append function calls. These can be @@ -138,14 +140,74 @@ were obtained from. ${x}_exp_command_buffer_sync_point_t syncPoint; ${x}CommandBufferAppendUSMMemcpyExp(hCommandBuffer, pDst, pSrc, size, 0, - nullptr, &syncPoint); + nullptr, 0, nullptr, &syncPoint, nullptr, + nullptr); // Append a kernel launch with syncPoint as a dependency, ignore returned // sync-point ${x}CommandBufferAppendKernelLaunchExp(hCommandBuffer, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, 1, &syncPoint, - nullptr, nullptr); + pLocalWorkSize, 0, nullptr, 1, + &syncPoint, 0, nullptr, + nullptr, nullptr, nullptr); + +Command Synchronization With Events +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +When appending commands to a command-buffer an optional ``phEventWaitList`` +input parameter is available for passing a list of ${x}_event_handle_t objects +the command should wait on. As well as an optional ``phEvent`` output parameter +to get a ${x}_event_handle_t object that will be signaled on completion of the +command execution. 
It is the user's responsibility to release the returned +``phEvent`` with ${x}EventRelease. + +The wait event parameter allows commands in a command-buffer to depend on the +completion of UR commands which are external to a command-buffer, while the +output signal event parameter allows individual commands in a command-buffer to +trigger external queue commands. Using returned signal events as wait events +inside the same command-buffer is also valid usage. + +.. important:: + Support for using ``phEventWaitList`` & ``phEvent`` parameters requires a device + to support ${X}_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP. + +Signal Event Valid Usage +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A returned signal event represents only the status of the command in the +current execution of a given command-buffer on a device. Command signal events +are not unique per execution of a command-buffer. If a command-buffer is +enqueued multiple times before using one of these events (for example as a +dependency to an eager queue operation), it is undefined which specific +execution of the command-buffer the event will represent. If a dependency on a +specific graph command-buffer execution is required, this ordering must be +enforced by the user to ensure there is only a single command-buffer execution +in flight when using these command signal events. + +When a user calls ${x}CommandBufferEnqueueExp, all the signal events returned +from the individual commands in the command-buffer are synchronously reset to +a non-complete state prior to the asynchronous commands beginning. + +Inter-Graph Synchronization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +It is possible for commands in different command-buffer objects to synchronize +using the event mechanism.
This is only guaranteed to behave correctly in the one +directional synchronization case, where the signal events of one +command-buffer's commands are used as a wait events of another command-buffer's +commands. Such a relationship defines a permanent dependency between the +command-buffers which does not need to be updated using +:ref:`command event update` to preserve synchronization on future enqueues of +the command-buffer. + +Bi-directional sync between individual commands in two separate command-buffers +is however not guaranteed to behave correctly. This is due to the completion +state of the command events only being reset when a command-buffer is enqueued. +It is therefore possible for the first command-buffer enqueued to execute its +wait node that needs to have its event reset by the enqueue of the second +command-buffer, before the code path returns to user code for the user to +enqueue the second command-buffer. Resulting in the first command-buffer's +wait node completing too early for the intended overall executing ordering. Enqueueing Command-Buffers -------------------------------------------------------------------------------- @@ -162,18 +224,37 @@ enqueued or executed simultaneously, and submissions may be serialized. ${x}CommandBufferEnqueueExp(hCommandBuffer, hQueue, 0, nullptr, &executionEvent); + Updating Command-Buffer Commands -------------------------------------------------------------------------------- An adapter implementing the command-buffer experimental feature can optionally support updating the configuration of kernel commands recorded to a -command-buffer. Support for this is reported by returning true in the -${X}_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP query. +command-buffer. The attributes of kernel commands that can be updated are +device specific and can be queried using the +${X}_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP query. 
+ +All update entry-points are synchronous and may block if the command-buffer is +executing when the entry-point is called. + +Kernel Argument Update +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Kernel commands can have the ND-Range & parameter arguments of the command +updated when a device supports the relevant bits in +${X}_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP. Updating kernel commands is done by passing the new kernel configuration to ${x}CommandBufferUpdateKernelLaunchExp along with the command handle of the kernel command to update. Configurations that can be changed are the -parameters to the kernel and the execution ND-Range. +kernel handle, the parameters to the kernel and the execution ND-Range. + +Kernel handles that might be used to update the kernel of a command, need +to be registered when the command is created. This can be done +using the ``phKernelAlternatives`` parameter of +${x}CommandBufferAppendKernelLaunchExp. The command can then be updated +to use the new kernel handle by passing it to +${x}CommandBufferUpdateKernelLaunchExp. .. parsed-literal:: @@ -187,11 +268,14 @@ parameters to the kernel and the execution ND-Range. ${x}CommandBufferCreateExp(hContext, hDevice, &desc, &hCommandBuffer); // Append a kernel command which has two buffer parameters, an input - // and an output. + // and an output. Register hNewKernel as an alternative kernel handle + // which can later be used to change the kernel handle associated + // with this command. ${x}_exp_command_buffer_command_handle_t hCommand; ${x}CommandBufferAppendKernelLaunchExp(hCommandBuffer, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, 0, nullptr, + pLocalWorkSize, 1, &hNewKernel, + 0, nullptr, 0, nullptr, nullptr, nullptr, &hCommand); // Close the command-buffer before updating @@ -220,6 +304,7 @@ parameters to the kernel and the execution ND-Range. 
${x}_exp_command_buffer_update_kernel_launch_desc_t update { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + hNewKernel, // hNewKernel 2, // numNewMemobjArgs 0, // numNewPointerArgs 0, // numNewValueArgs @@ -237,6 +322,76 @@ parameters to the kernel and the execution ND-Range. // Perform the update ${x}CommandBufferUpdateKernelLaunchExp(hCommand, &update); +Command Event Update +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Once a command-buffer has been finalized the wait-list parameter of the command +can be updated with ${x}CommandBufferUpdateWaitEventsExp. The number of wait +events for a command must stay consistent; therefore, the number of events +passed to ${x}CommandBufferUpdateWaitEventsExp must be the same as when the +command was created. + +The ${x}CommandBufferUpdateSignalEventExp entry-point can be used to update +the signal event of a command. This returns a new event that will be signaled +on the next execution of the command in the command-buffer. It may be that +this is backed by the same native event object as the original signal event, +provided that the backend provides a way to reset or reuse events between +command-buffer executions. + +As ${x}_event_handle_t objects for queue submissions can only be signaled once, +and not reset, this update mechanism allows command synchronization to be +refreshed between command-buffer executions with regular command-queue events +that haven't yet been signaled. + +It is the user's responsibility to release the returned ``phEvent`` with +${x}EventRelease. To update a command signal event with +${x}CommandBufferUpdateSignalEventExp there must also have been a non-null +``phEvent`` parameter passed on command creation. + +.. important:: + Support for updating ``phEventWaitList`` & ``phEvent`` parameters requires a + device to support the `EVENTS` bit in + ${X}_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP. + +..
parsed-literal:: + + // Create a command-buffer with update enabled. + ${x}_exp_command_buffer_desc_t desc { + ${X}_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, + nullptr, + true // isUpdatable + }; + ${x}_exp_command_buffer_handle_t hCommandBuffer; + ${x}CommandBufferCreateExp(hContext, hDevice, &desc, &hCommandBuffer); + + // Append a kernel command with 2 events to wait on, and returning an + // event that will be signaled. + ${x}_event_handle_t hSignalEvent; + ${x}_event_handle_t hWaitEvents[2] = {...}; + ${x}_exp_command_buffer_command_handle_t hCommand; + ${x}CommandBufferAppendKernelLaunchExp(hCommandBuffer, hKernel, workDim, + pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, 0, nullptr, 0, nullptr, + 2, hWaitEvents, nullptr, &hSignalEvent, + &hCommand); + + // Close the command-buffer before updating + ${x}CommandBufferFinalizeExp(hCommandBuffer); + + // Enqueue command-buffer + ${x}CommandBufferEnqueueExp(hCommandBuffer, hQueue, 0, nullptr, nullptr); + + // Wait for command-buffer to finish + ${x}QueueFinish(hQueue); + + // Update signal event + ${x}_event_handle_t hNewSignalEvent; + ${x}CommandBufferUpdateSignalEventExp(hCommand, &hNewSignalEvent); + + // Update wait events to a new event + ${x}_event_handle_t hNewWaitEvents[2] = {...}; + ${x}CommandBufferUpdateWaitEventsExp(hCommand, 2, hNewWaitEvents); + API -------------------------------------------------------------------------------- @@ -249,7 +404,15 @@ Enums ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * ${x}_device_info_t * ${X}_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP - * ${X}_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP + * ${X}_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP + * ${X}_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP +* ${x}_device_command_buffer_update_capability_flags_t + * UPDATE_KERNEL_ARGUMENTS + * LOCAL_WORK_SIZE + * GLOBAL_WORK_SIZE + * GLOBAL_WORK_OFFSET + * KERNEL_HANDLE + * EVENTS * ${x}_result_t *
${X}_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP * ${X}_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP @@ -284,6 +447,7 @@ Enums * ${X}_FUNCTION_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_EXP * ${x}_exp_command_buffer_info_t * ${X}_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT + * ${X}_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR * ${x}_exp_command_buffer_command_info_t * ${X}_EXP_COMMAND_BUFFER_COMMAND_INFO_REFERENCE_COUNT @@ -320,6 +484,8 @@ Functions * ${x}CommandBufferRetainCommandExp * ${x}CommandBufferReleaseCommandExp * ${x}CommandBufferUpdateKernelLaunchExp +* ${x}CommandBufferUpdateSignalEventExp +* ${x}CommandBufferUpdateWaitEventsExp * ${x}CommandBufferGetInfoExp * ${x}CommandBufferCommandGetInfoExp @@ -340,6 +506,10 @@ Changelog +-----------+-------------------------------------------------------+ | 1.4 | Add function definitions for kernel command update | +-----------+-------------------------------------------------------+ +| 1.5 | Add support for updating kernel handles. | ++-----------+-------------------------------------------------------+ +| 1.6 | Command level synchronization with event objects | ++-----------+-------------------------------------------------------+ Contributors -------------------------------------------------------------------------------- @@ -348,3 +518,4 @@ Contributors * Ewan Crawford `ewan@codeplay.com `_ * Maxime France-Pillois `maxime.francepillois@codeplay.com `_ * Aaron Greig `aaron.greig@codeplay.com `_ +* Fábio Mestre `fabio.mestre@codeplay.com `_ diff --git a/scripts/core/EXP-NATIVE-ENQUEUE.rst b/scripts/core/EXP-NATIVE-ENQUEUE.rst index 152bf05956..876e4e136d 100644 --- a/scripts/core/EXP-NATIVE-ENQUEUE.rst +++ b/scripts/core/EXP-NATIVE-ENQUEUE.rst @@ -32,13 +32,13 @@ within the native API through the function passed to ${x}EnqueueNativeCommandExp, the function argument must only use the native queue accessed through ${x}QueueGetNativeHandle. 
Use of a native queue that is not the native queue returned by ${x}QueueGetNativeHandle results in undefined -behaviour. +behavior. -Any args that are needed by the func must be passed through a void* and unpacked -within the func. If ${x}_mem_handle_t arguments are to be used within -pfnNativeEnqueue, they must be accessed using ${x}MemGetNativeHandle. -${x}_mem_handle_t arguments must be packed in the void* argument that will be -used in pfnNativeEnqueue, as well as ${x}EnqueueNativeCommandExp's phMemList +Any args that are needed by the func must be passed through a ``void*`` and unpacked +within the func. If ``${x}_mem_handle_t`` arguments are to be used within +``pfnNativeEnqueue``, they must be accessed using ${x}MemGetNativeHandle. +``${x}_mem_handle_t`` arguments must be packed in the void* argument that will be +used in ``pfnNativeEnqueue``, as well as ${x}EnqueueNativeCommandExp's ``phMemList`` argument. API @@ -65,20 +65,20 @@ Functions Changelog -------------------------------------------------------------------------------- -+-----------+-------------------------+ -| Revision | Changes | -+===========+=========================+ -| 1.0 | Initial Draft | -+-----------+-------------------------+ -| 1.1 | Make `phEvent` optional | -+-----------+-------------------------+ ++-----------+---------------------------+ +| Revision | Changes | ++===========+===========================+ +| 1.0 | Initial Draft | ++-----------+---------------------------+ +| 1.1 | Make ``phEvent`` optional | ++-----------+---------------------------+ Support -------------------------------------------------------------------------------- Adapters which support this experimental feature *must* return true for the new -`${X}_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP` device info query. +``${X}_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP`` device info query. 
Contributors @@ -86,3 +86,4 @@ Contributors * Hugh Delaney `hugh.delaney@codeplay.com `_ * Kenneth Benzie (Benie) `k.benzie@codeplay.com `_ +* Ewan Crawford `ewan@codeplay.com `_ diff --git a/scripts/core/INTRO.rst b/scripts/core/INTRO.rst index 1f319d6884..9a46d12170 100644 --- a/scripts/core/INTRO.rst +++ b/scripts/core/INTRO.rst @@ -161,7 +161,7 @@ Tracing Unified Runtime loader implements tracing support through the `XPTI framework `__. -.. list-table:: UR Stream `"ur"` Notification Signatures +.. list-table:: UR Stream `"ur.call"` Notification Signatures :header-rows: 1 * - Trace Point Type @@ -286,7 +286,7 @@ for these parameter structs can be found in the main API header. Layers --------------------- UR comes with a mechanism that allows various API intercept layers to be enabled, either through the API or with an environment variable (see `Environment Variables`_). -Layers currently included with the runtime are as follows: +By default, no layers are enabled. Layers currently included with the runtime are as follows: .. list-table:: :header-rows: 1 @@ -295,6 +295,8 @@ Layers currently included with the runtime are as follows: - Description * - UR_LAYER_PARAMETER_VALIDATION - Enables non-adapter-specific parameter validation (e.g. checking for null values). + * - UR_LAYER_BOUNDS_CHECKING + - Enables non-adapter-specific bounds checking of USM allocations for enqueued commands. Automatically enables UR_LAYER_PARAMETER_VALIDATION. * - UR_LAYER_LEAK_CHECKING - Performs some leak checking for API calls involving object creation/destruction. * - UR_LAYER_LIFETIME_VALIDATION @@ -394,6 +396,40 @@ Specific environment variables can be set to control the behavior of unified run See the Layers_ section for details of the layers currently included in the runtime. +.. envvar:: UR_LOADER_PRELOAD_FILTER + + If set, the loader will read `ONEAPI_DEVICE_SELECTOR` before loading the UR Adapters to determine which backends should be loaded. + + .. 
note:: + + This environment variable is default enabled on Linux, but default disabled on Windows. + +CTS Environment Variables +------------------------- + +The following environment variables are used by the CTS runner and can be used to specify the platform that the test +framework should run on. This can be used during development and testing to run CTS tests in case multiple platforms +are available. If both filters are specified, then they both must match a platform for it to be selected. If there are +no valid platforms, then the tests will fail. Command line arguments take priority over these variables. + +.. envvar:: UR_CTS_ADAPTER_PLATFORM + + A specifier list in the form of `[(backend):](platform name)[;[(backend)]:(platform name)]...`. If a backend + specific specifier is present in the list and the test is running for that backend, the device with the given name + is chosen. Otherwise, it must match the name of the specifier from the list with no backend. Backend names are case- + insensitive, however platform names must match exactly. + + For example, if the test device has multiple platforms and you want to run tests on the "ABC Corp" backend when + testing OpenCL and "XYZ Org" when testing level zero, you'd use `OPENCL:ABC Corp;LEVEL_ZERO:XYZ Org`. This form is + useful when running the `build` target with a build with multiple backends. + + For testing only one platform, the backend can be omitted. For example, just `ABC Corp` is sufficient if the tests + are only going to be testing OpenCL. + +.. envvar:: UR_CTS_BACKEND + + A (case insensitive) backend to force the test to use. For example, `opencl`, `level_zero`, `hip` and so on. 
+ Service identifiers --------------------- diff --git a/scripts/core/adapter.yml b/scripts/core/adapter.yml index 958d135b78..a4eddd823c 100644 --- a/scripts/core/adapter.yml +++ b/scripts/core/adapter.yml @@ -28,20 +28,21 @@ params: - type: "uint32_t" name: NumEntries desc: | - [in] the number of adapters to be added to phAdapters. + [in] the number of adapters to be added to phAdapters. If phAdapters is not NULL, then NumEntries should be greater than zero, otherwise $X_RESULT_ERROR_INVALID_SIZE, will be returned. - type: "$x_adapter_handle_t*" name: phAdapters desc: | [out][optional][range(0, NumEntries)] array of handle of adapters. - If NumEntries is less than the number of adapters available, then $xAdapterGet shall only retrieve that number of platforms. + If NumEntries is less than the number of adapters available, then $xAdapterGet shall only retrieve that number of adapters. - type: "uint32_t*" name: "pNumAdapters" desc: | - [out][optional] returns the total number of adapters available. + [out][optional] returns the total number of adapters available. 
returns: - - $X_RESULT_ERROR_INVALID_SIZE + - $X_RESULT_ERROR_INVALID_SIZE: + - "`NumEntries == 0 && phAdapters != NULL`" --- #-------------------------------------------------------------------------- type: function desc: "Releases the adapter handle reference indicating end of its usage" diff --git a/scripts/core/common.yml b/scripts/core/common.yml index d06333eb07..73501ac39d 100644 --- a/scripts/core/common.yml +++ b/scripts/core/common.yml @@ -39,6 +39,12 @@ desc: "Microsoft-specific dllexport storage-class attribute" condition: "defined(_WIN32)" name: $X_APIEXPORT value: __declspec(dllexport) +--- #-------------------------------------------------------------------------- +type: macro +desc: "GCC-specific dllexport storage-class attribute" +condition: "__GNUC__ >= 4" +name: $X_APIEXPORT +value: __attribute__ ((visibility ("default"))) altvalue: "" --- #-------------------------------------------------------------------------- type: macro @@ -269,13 +275,15 @@ etors: desc: "Objection allocation failure" - name: ERROR_ADAPTER_SPECIFIC desc: "An adapter specific warning/error has been reported and can be retrieved - via the urPlatformGetLastError entry point." + via the urAdapterGetLastError entry point." - name: ERROR_LAYER_NOT_PRESENT desc: "A requested layer was not found by the loader." - name: ERROR_IN_EVENT_LIST_EXEC_STATUS desc: "An event in the provided wait list has $X_EVENT_STATUS_ERROR." - name: ERROR_DEVICE_NOT_AVAILABLE desc: "Device in question has `$X_DEVICE_INFO_AVAILABLE == false`" + - name: ERROR_INVALID_SPEC_ID + desc: "A specialization constant identifier is not valid." - name: ERROR_UNKNOWN value: "0x7ffffffe" desc: "Unknown or internal error" diff --git a/scripts/core/context.yml b/scripts/core/context.yml index 6568a485ae..e45c93010d 100644 --- a/scripts/core/context.yml +++ b/scripts/core/context.yml @@ -95,7 +95,7 @@ etors: - name: REFERENCE_COUNT desc: | [uint32_t] Reference count of the context object. 
- The reference count returned should be considered immediately stale. + The reference count returned should be considered immediately stale. It is unsuitable for general use in applications. This feature is provided for identifying memory leaks. - name: USM_MEMCPY2D_SUPPORT desc: "[$x_bool_t] to indicate if the $xEnqueueUSMMemcpy2D entrypoint is supported." @@ -107,11 +107,11 @@ etors: desc: "[$x_memory_scope_capability_flags_t] return a bit-field of atomic memory scope capabilities." - name: ATOMIC_FENCE_ORDER_CAPABILITIES desc: | - [$x_memory_order_capability_flags_t] return a bit-field of atomic memory fence order capabilities. + [$x_memory_order_capability_flags_t] return a bit-field of atomic memory fence order capabilities. Zero is returned if the backend does not support context-level fences. - name: ATOMIC_FENCE_SCOPE_CAPABILITIES desc: | - [$x_memory_scope_capability_flags_t] return a bit-field of atomic memory fence scope capabilities. + [$x_memory_scope_capability_flags_t] return a bit-field of atomic memory fence scope capabilities. Zero is returned if the backend does not support context-level fences. --- #-------------------------------------------------------------------------- type: function @@ -122,7 +122,7 @@ decl: static ordinal: "0" analogue: - "**clReleaseContext**" -details: +details: - "The application may call this function from simultaneous threads for the same context." - "The implementation of this function should be thread-safe." params: @@ -157,7 +157,7 @@ params: name: pPropValue desc: | [out][optional][typename(propName, propSize)] array of bytes holding the info. - if propSize is not equal to or greater than the real number of bytes needed to return + if propSize is not equal to or greater than the real number of bytes needed to return the info then the $X_RESULT_ERROR_INVALID_SIZE error is returned and pPropValue is not used. 
- type: "size_t*" name: pPropSizeRet diff --git a/scripts/core/device.yml b/scripts/core/device.yml index ead3ceeb8d..6641d8bb2b 100644 --- a/scripts/core/device.yml +++ b/scripts/core/device.yml @@ -15,8 +15,8 @@ ordinal: "2" --- #-------------------------------------------------------------------------- type: macro desc: | - Target identification strings for $x_device_binary_t.pDeviceTargetSpec - A device type represented by a particular target triple requires specific + Target identification strings for $x_device_binary_t.pDeviceTargetSpec + A device type represented by a particular target triple requires specific binary images. We need to map the image type onto the device target triple name: "$X_DEVICE_BINARY_TARGET_UNKNOWN" value: "\"\"" @@ -35,7 +35,7 @@ value: "\"spir64\"" --- #-------------------------------------------------------------------------- type: macro desc: | - Device-specific binary images produced from SPIR-V 64-bit <-> various + Device-specific binary images produced from SPIR-V 64-bit <-> various "spir64_*" triples for specific 64-bit OpenCL CPU devices name: "$X_DEVICE_BINARY_TARGET_SPIRV64_X86_64" value: "\"spir64_x86_64\"" @@ -75,7 +75,7 @@ members: name: pDeviceTargetSpec desc: | [in] null-terminated string representation of the device's target architecture. - For example: + For example: + $X_DEVICE_BINARY_TARGET_UNKNOWN + $X_DEVICE_BINARY_TARGET_SPIRV32 + $X_DEVICE_BINARY_TARGET_SPIRV64 @@ -326,7 +326,7 @@ etors: - name: REFERENCE_COUNT desc: | [uint32_t] Reference count of the device object. - The reference count returned should be considered immediately stale. + The reference count returned should be considered immediately stale. It is unsuitable for general use in applications. This feature is provided for identifying memory leaks. 
- name: IL_VERSION desc: "[char[]] IL version" @@ -356,7 +356,7 @@ etors: desc: "[uint32_t] maximum number of sub-devices when the device is partitioned" - name: PARTITION_AFFINITY_DOMAIN desc: | - [$x_device_affinity_domain_flags_t] Returns a bit-field of the supported affinity domains for partitioning. + [$x_device_affinity_domain_flags_t] Returns a bit-field of the supported affinity domains for partitioning. If the device does not support any affinity domains, then 0 will be returned. - name: PARTITION_TYPE desc: "[$x_device_partition_property_t[]] return an array of $x_device_partition_property_t for properties specified in $xDevicePartition" @@ -365,7 +365,7 @@ etors: - name: SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS desc: "[$x_bool_t] support sub group independent forward progress" - name: SUB_GROUP_SIZES_INTEL - desc: "[uint32_t[]] return an array of sub group sizes supported on Intel device" + desc: "[uint32_t[]] return an array of supported sub group sizes" - name: USM_HOST_SUPPORT desc: "[$x_device_usm_access_capability_flags_t] support USM host memory access" - name: USM_DEVICE_SUPPORT @@ -412,7 +412,7 @@ etors: desc: "[$x_bool_t] support for bfloat16" - name: MAX_COMPUTE_QUEUE_INDICES desc: | - [uint32_t] Returns 1 if the device doesn't have a notion of a + [uint32_t] Returns 1 if the device doesn't have a notion of a queue index. Otherwise, returns the number of queue indices that are available for this device. - name: KERNEL_SET_SPECIALIZATION_CONSTANTS @@ -548,10 +548,10 @@ etors: value: "$X_BIT(4)" - name: NEXT_PARTITIONABLE desc: | - Split the device along the next partitionable affinity domain. + Split the device along the next partitionable affinity domain. 
The implementation shall find the first level along which the device - or sub device may be further subdivided in the order: - $X_DEVICE_AFFINITY_DOMAIN_FLAG_NUMA, $X_DEVICE_AFFINITY_DOMAIN_FLAG_L4_CACHE, $X_DEVICE_AFFINITY_DOMAIN_FLAG_L3_CACHE, $X_DEVICE_AFFINITY_DOMAIN_FLAG_L2_CACHE, $X_DEVICE_AFFINITY_DOMAIN_FLAG_L1_CACHE, + or sub device may be further subdivided in the order: + $X_DEVICE_AFFINITY_DOMAIN_FLAG_NUMA, $X_DEVICE_AFFINITY_DOMAIN_FLAG_L4_CACHE, $X_DEVICE_AFFINITY_DOMAIN_FLAG_L3_CACHE, $X_DEVICE_AFFINITY_DOMAIN_FLAG_L2_CACHE, $X_DEVICE_AFFINITY_DOMAIN_FLAG_L1_CACHE, and partition the device into sub devices comprised of compute units that share memory subsystems at this level. value: "$X_BIT(5)" --- #-------------------------------------------------------------------------- @@ -679,7 +679,7 @@ params: - type: "uint32_t" name: NumBinaries desc: | - [in] the number of binaries passed in ppBinaries. + [in] the number of binaries passed in ppBinaries. Must be greater than or equal to zero, otherwise $X_RESULT_ERROR_INVALID_VALUE is returned. - type: "uint32_t*" name: pSelectedBinary @@ -820,9 +820,9 @@ params: - type: $x_native_handle_t name: hNativeDevice desc: "[in][nocheck] the native handle of the device." - - type: $x_platform_handle_t - name: hPlatform - desc: "[in] handle of the platform instance" + - type: $x_adapter_handle_t + name: hAdapter + desc: "[in] handle of the adapter to which `hNativeDevice` belongs" - type: const $x_device_native_properties_t* name: pProperties desc: "[in][optional] pointer to native device properties struct."
@@ -850,13 +850,13 @@ params: desc: "[in] handle of the device instance" - type: "uint64_t*" name: pDeviceTimestamp - desc: | - [out][optional] pointer to the Device's global timestamp that + desc: | + [out][optional] pointer to the Device's global timestamp that correlates with the Host's global timestamp value - type: "uint64_t*" name: pHostTimestamp desc: | - [out][optional] pointer to the Host's global timestamp that + [out][optional] pointer to the Host's global timestamp that correlates with the Device's global timestamp value --- #-------------------------------------------------------------------------- type: enum diff --git a/scripts/core/enqueue.yml b/scripts/core/enqueue.yml index edf655318a..f712dd9021 100644 --- a/scripts/core/enqueue.yml +++ b/scripts/core/enqueue.yml @@ -33,13 +33,13 @@ params: name: pGlobalWorkOffset desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item" - type: "const size_t*" - name: pGlobalWorkSize + name: pGlobalWorkSize desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function" - type: "const size_t*" name: pLocalWorkSize desc: | [in][optional] pointer to an array of workDim unsigned values that specify the number of local work-items forming a work-group that will execute the kernel function. - If nullptr, the runtime implementation will choose the work-group size. + If nullptr, the runtime implementation will choose the work-group size. - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" @@ -47,12 +47,12 @@ params: name: phEventWaitList desc: | [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. - If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. + If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. 
- type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular kernel execution instance. -returns: + [out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. +returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_KERNEL - $X_RESULT_ERROR_INVALID_EVENT @@ -71,7 +71,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command which waits a list of events to complete before it completes" +desc: "Enqueue a command which waits a list of events to complete before it completes" class: $xEnqueue name: EventsWait ordinal: "0" @@ -92,12 +92,12 @@ params: desc: | [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before this command can be executed. If nullptr, the numEventsInWaitList must be 0, indicating that all previously enqueued commands - must be complete. + must be complete. - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. -returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. 
+returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -111,7 +111,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a barrier command which waits a list of events to complete before it completes" +desc: "Enqueue a barrier command which waits a list of events to complete before it completes" class: $xEnqueue name: EventsWaitWithBarrier ordinal: "0" @@ -133,12 +133,12 @@ params: desc: | [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before this command can be executed. If nullptr, the numEventsInWaitList must be 0, indicating that all previously enqueued commands - must be complete. + must be complete. - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. -returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. 
+returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -152,7 +152,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to read from a buffer object to host memory" +desc: "Enqueue a command to read from a buffer object to host memory" class: $xEnqueue name: MemBufferRead ordinal: "0" @@ -178,7 +178,7 @@ params: desc: "[in] size in bytes of data being read" - type: void* name: pDst - desc: "[in] pointer to host memory where data is to be read into" + desc: "[in] pointer to host memory where data is to be read into" - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" @@ -190,8 +190,8 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. -returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. +returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -207,11 +207,11 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to write into a buffer object from host memory" +desc: "Enqueue a command to write into a buffer object from host memory" class: $xEnqueue name: MemBufferWrite ordinal: "0" -details: +details: - "Input parameter blockingWrite indicates if the write is blocking or non-blocking." 
analogue: - "**clEnqueueWriteBuffer**" @@ -233,7 +233,7 @@ params: desc: "[in] size in bytes of data being written" - type: "const void*" name: pSrc - desc: "[in] pointer to host memory where data is to be written from" + desc: "[in] pointer to host memory where data is to be written from" - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" @@ -245,8 +245,8 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. -returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. +returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -262,7 +262,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to read a 2D or 3D rectangular region from a buffer object to host memory" +desc: "Enqueue a command to read a 2D or 3D rectangular region from a buffer object to host memory" class: $xEnqueue name: MemBufferReadRect ordinal: "0" @@ -301,10 +301,10 @@ params: desc: "[in] length of each row in bytes in the host memory region pointed by dst" - type: size_t name: hostSlicePitch - desc: "[in] length of each 2D slice in bytes in the host memory region pointed by dst" + desc: "[in] length of each 2D slice in bytes in the host memory region pointed by dst" - type: void* name: pDst - desc: "[in] pointer to host memory where data is to be read into" + desc: "[in] pointer to host memory where data is to be read into" - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" @@ -316,8 +316,8 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this 
particular command instance. -returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. +returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -340,7 +340,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to write a 2D or 3D rectangular region in a buffer object from host memory" +desc: "Enqueue a command to write a 2D or 3D rectangular region in a buffer object from host memory" class: $xEnqueue name: MemBufferWriteRect ordinal: "0" @@ -379,10 +379,10 @@ params: desc: "[in] length of each row in bytes in the host memory region pointed by src" - type: size_t name: hostSlicePitch - desc: "[in] length of each 2D slice in bytes in the host memory region pointed by src" + desc: "[in] length of each 2D slice in bytes in the host memory region pointed by src" - type: void* name: pSrc - desc: "[in] pointer to host memory where data is to be written from" + desc: "[in] pointer to host memory where data is to be written from" - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" @@ -394,8 +394,8 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. -returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. 
+returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -418,7 +418,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to copy from a buffer object to another" +desc: "Enqueue a command to copy from a buffer object to another" class: $xEnqueue name: MemBufferCopy ordinal: "0" @@ -437,7 +437,7 @@ params: - type: size_t name: srcOffset desc: "[in] offset into hBufferSrc to begin copying from" - - type: size_t + - type: size_t name: dstOffset desc: "[in] offset info hBufferDst to begin copying into" - type: size_t @@ -454,8 +454,8 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. -returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. 
+returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -472,7 +472,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to copy a 2D or 3D rectangular region from one buffer object to another" +desc: "Enqueue a command to copy a 2D or 3D rectangular region from one buffer object to another" class: $xEnqueue name: MemBufferCopyRect ordinal: "0" @@ -508,7 +508,7 @@ params: desc: "[in] length of each row in bytes in the destination buffer object" - type: size_t name: dstSlicePitch - desc: "[in] length of each 2D slice in bytes in the destination buffer object" + desc: "[in] length of each 2D slice in bytes in the destination buffer object" - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" @@ -520,8 +520,8 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. -returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. +returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -545,7 +545,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to fill a buffer object with a pattern of a given size" +desc: "Enqueue a command to fill a buffer object with a pattern of a given size" class: $xEnqueue name: MemBufferFill ordinal: "0" @@ -581,8 +581,8 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. 
-returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. +returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -603,7 +603,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to read from an image or image array object to host memory" +desc: "Enqueue a command to read from an image or image array object to host memory" class: $xEnqueue name: MemImageRead ordinal: "0" @@ -635,7 +635,7 @@ params: desc: "[in] length of each 2D slice of the 3D image" - type: void* name: pDst - desc: "[in] pointer to host memory where image is to be read into" + desc: "[in] pointer to host memory where image is to be read into" - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" @@ -647,8 +647,8 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. -returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. 
+returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -664,7 +664,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to write an image or image array object from host memory" +desc: "Enqueue a command to write an image or image array object from host memory" class: $xEnqueue name: MemImageWrite ordinal: "0" @@ -696,7 +696,7 @@ params: desc: "[in] length of each 2D slice of the 3D image" - type: void* name: pSrc - desc: "[in] pointer to host memory where image is to be read into" + desc: "[in] pointer to host memory where image is to be read into" - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" @@ -708,8 +708,8 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. -returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. +returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -725,7 +725,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to copy from an image object to another" +desc: "Enqueue a command to copy from an image object to another" class: $xEnqueue name: MemImageCopy ordinal: "0" @@ -761,8 +761,8 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. -returns: + [out][optional] return an event object that identifies this particular command instance. 
If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. +returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -778,7 +778,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to fill an image object with specified color" +desc: "Enqueue a command to fill an image object with specified color" class: $xEnqueue name: MemImageFill ordinal: "0" @@ -817,8 +817,8 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. -returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. +returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -839,10 +839,10 @@ etors: - name: READ desc: "Map for read access" value: "$X_BIT(0)" - - name: WRITE + - name: WRITE desc: "Map for write access" value: "$X_BIT(1)" - - name: WRITE_INVALIDATE_REGION + - name: WRITE_INVALIDATE_REGION desc: "Map for discard_write access" value: "$X_BIT(2)" --- #-------------------------------------------------------------------------- @@ -856,11 +856,11 @@ etors: value: "$X_BIT(0)" --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to map a region of the buffer object into the host address space and return a pointer to the mapped region" +desc: "Enqueue a command to map a region of the buffer object into the host address space and return a pointer to the mapped region" class: $xEnqueue name: MemBufferMap ordinal: "0" -details: +details: - "Input parameter blockingMap indicates if the map is blocking or 
non-blocking." - "Currently, no direct support in Level Zero. Implemented as a shared allocation followed by copying on discrete GPU" - "TODO: add a driver function in Level Zero?" @@ -896,11 +896,11 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. - type: void** name: ppRetMap desc: "[out] return mapped pointer. TODO: move it before numEventsInWaitList?" -returns: +returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -916,7 +916,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to map a region of the image object into the host address space and return a pointer to the mapped region" +desc: "Enqueue a command to map a region of the image object into the host address space and return a pointer to the mapped region" class: $xEnqueue name: MemImageMap version: "9999.0" # See #50 @@ -966,8 +966,8 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. -returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. 
+returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -981,7 +981,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to unmap a previously mapped region of a memory object" +desc: "Enqueue a command to unmap a previously mapped region of a memory object" class: $xEnqueue name: MemUnmap ordinal: "0" @@ -1008,8 +1008,8 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. -returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. +returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: @@ -1054,7 +1054,7 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT @@ -1074,7 +1074,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to copy USM memory" +desc: "Enqueue a command to copy USM memory" class: $xEnqueue name: USMMemcpy ordinal: "0" @@ -1105,8 +1105,8 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. 
-returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. +returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_SIZE: @@ -1123,7 +1123,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Enqueue a command to prefetch USM memory" +desc: "Enqueue a command to prefetch USM memory" class: $xEnqueue name: USMPrefetch ordinal: "0" @@ -1153,8 +1153,8 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. -returns: + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. +returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_SIZE: @@ -1194,7 +1194,7 @@ params: name: phEvent desc: | [out][optional] return an event object that identifies this particular command instance. -returns: +returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_INVALID_SIZE: @@ -1238,11 +1238,11 @@ params: name: phEventWaitList desc: | [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. - If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. + If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular kernel execution instance. + [out][optional] return an event object that identifies this particular kernel execution instance. 
If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. returns: - $X_RESULT_ERROR_INVALID_SIZE: - "`pitch == 0`" @@ -1302,11 +1302,11 @@ params: name: phEventWaitList desc: | [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. - If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. + If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular kernel execution instance. + [out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. returns: - $X_RESULT_ERROR_INVALID_SIZE: - "`srcPitch == 0`" @@ -1361,11 +1361,11 @@ params: name: phEventWaitList desc: | [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. - If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. + If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular kernel execution instance. + [out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. returns: - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: - "`phEventWaitList == NULL && numEventsInWaitList > 0`" @@ -1408,11 +1408,11 @@ params: name: phEventWaitList desc: | [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. 
- If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. + If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular kernel execution instance. + [out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. returns: - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: - "`phEventWaitList == NULL && numEventsInWaitList > 0`" @@ -1428,28 +1428,28 @@ name: ReadHostPipe ordinal: "0" params: - type: $x_queue_handle_t - name: hQueue + name: hQueue desc: | [in] a valid host command-queue in which the read command will be queued. hQueue and hProgram must be created with the same UR context. - type: $x_program_handle_t - name: hProgram + name: hProgram desc: "[in] a program object with a successfully built executable." - type: const char* - name: pipe_symbol + name: pipe_symbol desc: "[in] the name of the program scope pipe global variable." - type: bool - name: blocking + name: blocking desc: "[in] indicate if the read operation is blocking or non-blocking." - type: void* - name: pDst + name: pDst desc: "[in] a pointer to buffer in host memory that will hold resulting data from pipe." - type: size_t - name: size + name: size desc: "[in] size of the memory region to read, in bytes." - type: uint32_t - name: numEventsInWaitList + name: numEventsInWaitList desc: "[in] number of events in the wait list." - type: const $x_event_handle_t* name: phEventWaitList @@ -1457,10 +1457,10 @@ params: [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the host pipe read. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. 
- type: $x_event_handle_t* - name: phEvent + name: phEvent desc: | - [out][optional] returns an event object that identifies this read command - and can be used to query or queue a wait for this command to complete. + [out][optional] returns an event object that identifies this read command + and can be used to query or queue a wait for this command to complete. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. returns: - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: - "`phEventWaitList == NULL && numEventsInWaitList > 0`" @@ -1476,28 +1476,28 @@ name: WriteHostPipe ordinal: "0" params: - type: $x_queue_handle_t - name: hQueue + name: hQueue desc: | [in] a valid host command-queue in which the write command will be queued. hQueue and hProgram must be created with the same UR context. - type: $x_program_handle_t - name: hProgram + name: hProgram desc: "[in] a program object with a successfully built executable." - type: const char* - name: pipe_symbol + name: pipe_symbol desc: "[in] the name of the program scope pipe global variable." - type: bool - name: blocking + name: blocking desc: "[in] indicate if the read and write operations are blocking or non-blocking." - type: void* - name: pSrc + name: pSrc desc: "[in] a pointer to buffer in host memory that holds data to be written to the host pipe." - type: size_t - name: size + name: size desc: "[in] size of the memory region to read or write, in bytes." - type: uint32_t - name: numEventsInWaitList + name: numEventsInWaitList desc: "[in] number of events in the wait list." - type: const $x_event_handle_t* name: phEventWaitList @@ -1505,10 +1505,10 @@ params: [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the host pipe write. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. 
- type: $x_event_handle_t* - name: phEvent + name: phEvent desc: | [out][optional] returns an event object that identifies this write command - and can be used to query or queue a wait for this command to complete. + and can be used to query or queue a wait for this command to complete. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. returns: - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: - "`phEventWaitList == NULL && numEventsInWaitList > 0`" diff --git a/scripts/core/event.yml b/scripts/core/event.yml index 45bcbf7d40..2f42cb7122 100644 --- a/scripts/core/event.yml +++ b/scripts/core/event.yml @@ -107,7 +107,7 @@ etors: - name: REFERENCE_COUNT desc: | [uint32_t] Reference count of the event object. - The reference count returned should be considered immediately stale. + The reference count returned should be considered immediately stale. It is unsuitable for general use in applications. This feature is provided for identifying memory leaks. 
--- #-------------------------------------------------------------------------- type: enum @@ -210,7 +210,7 @@ params: - type: "const $x_event_handle_t*" name: phEventWaitList desc: "[in][range(0, numEvents)] pointer to a list of events to wait for completion" -returns: +returns: - $X_RESULT_ERROR_INVALID_VALUE: - "`numEvents == 0`" - $X_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS: diff --git a/scripts/core/exp-bindless-images.yml b/scripts/core/exp-bindless-images.yml index 0a27d5a9d1..5648c56333 100644 --- a/scripts/core/exp-bindless-images.yml +++ b/scripts/core/exp-bindless-images.yml @@ -23,21 +23,21 @@ class: $xBindlessImages name: "$x_exp_image_mem_native_handle_t" --- #-------------------------------------------------------------------------- type: handle -desc: "Handle of interop memory" +desc: "Handle of external memory" class: $xBindlessImages -name: "$x_exp_interop_mem_handle_t" +name: "$x_exp_external_mem_handle_t" --- #-------------------------------------------------------------------------- type: handle -desc: "Handle of interop semaphore" +desc: "Handle of external semaphore" class: $xBindlessImages -name: "$x_exp_interop_semaphore_handle_t" +name: "$x_exp_external_semaphore_handle_t" --- #-------------------------------------------------------------------------- type: enum extend: true typed_etors: true desc: "Extension enums to $x_device_info_t to support bindless images." 
name: $x_device_info_t -etors: +etors: - name: BINDLESS_IMAGES_SUPPORT_EXP value: "0x2000" desc: "[$x_bool_t] returns true if the device supports the creation of bindless images" @@ -74,10 +74,10 @@ etors: - name: MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP value: "0x200B" desc: "[$x_bool_t] returns true if the device supports using images created from individual mipmap levels" - - name: INTEROP_MEMORY_IMPORT_SUPPORT_EXP + - name: EXTERNAL_MEMORY_IMPORT_SUPPORT_EXP value: "0x200C" desc: "[$x_bool_t] returns true if the device supports importing external memory resources" - - name: INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP + - name: EXTERNAL_SEMAPHORE_IMPORT_SUPPORT_EXP value: "0x200E" desc: "[$x_bool_t] returns true if the device supports importing external semaphore resources" - name: CUBEMAP_SUPPORT_EXP @@ -118,15 +118,15 @@ type: enum extend: true desc: "Structure Type experimental enumerations." name: $x_structure_type_t -etors: +etors: - name: EXP_SAMPLER_MIP_PROPERTIES desc: $x_exp_sampler_mip_properties_t value: "0x2000" - - name: EXP_INTEROP_MEM_DESC - desc: $x_exp_interop_mem_desc_t + - name: EXP_EXTERNAL_MEM_DESC + desc: $x_exp_external_mem_desc_t value: "0x2001" - - name: EXP_INTEROP_SEMAPHORE_DESC - desc: $x_exp_interop_semaphore_desc_t + - name: EXP_EXTERNAL_SEMAPHORE_DESC + desc: $x_exp_external_semaphore_desc_t value: "0x2002" - name: EXP_FILE_DESCRIPTOR desc: $x_exp_file_descriptor_t @@ -149,10 +149,10 @@ extend: true desc: "Command Type experimental enumerations." 
name: $x_command_t etors: - - name: INTEROP_SEMAPHORE_WAIT_EXP + - name: EXTERNAL_SEMAPHORE_WAIT_EXP value: "0x2000" desc: Event created by $xBindlessImagesWaitExternalSemaphoreExp - - name: INTEROP_SEMAPHORE_SIGNAL_EXP + - name: EXTERNAL_SEMAPHORE_SIGNAL_EXP value: "0x2001" desc: Event created by $xBindlessImagesSignalExternalSemaphoreExp --- #-------------------------------------------------------------------------- @@ -278,16 +278,16 @@ members: desc: "[in] enables or disables seamless cubemap filtering between cubemap faces" --- #-------------------------------------------------------------------------- type: struct -desc: "Describes an interop memory resource descriptor" +desc: "Describes an external memory resource descriptor" class: $xBindlessImages -name: $x_exp_interop_mem_desc_t +name: $x_exp_external_mem_desc_t base: $x_base_desc_t members: [] --- #-------------------------------------------------------------------------- type: struct -desc: "Describes an interop semaphore resource descriptor" +desc: "Describes an external semaphore resource descriptor" class: $xBindlessImages -name: $x_exp_interop_semaphore_desc_t +name: $x_exp_external_semaphore_desc_t base: $x_base_desc_t members: [] --- #-------------------------------------------------------------------------- @@ -581,7 +581,7 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. 
returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_VALUE @@ -692,19 +692,19 @@ params: - type: $x_exp_external_mem_type_t name: memHandleType desc: "[in] type of external memory handle" - - type: $x_exp_interop_mem_desc_t* - name: pInteropMemDesc - desc: "[in] the interop memory descriptor" - - type: $x_exp_interop_mem_handle_t* - name: phInteropMem - desc: "[out] interop memory handle to the external memory" + - type: $x_exp_external_mem_desc_t* + name: pExternalMemDesc + desc: "[in] the external memory descriptor" + - type: $x_exp_external_mem_handle_t* + name: phExternalMem + desc: "[out] external memory handle to the external memory" returns: - $X_RESULT_ERROR_INVALID_CONTEXT - $X_RESULT_ERROR_INVALID_VALUE - $X_RESULT_ERROR_INVALID_MEM_OBJECT --- #-------------------------------------------------------------------------- type: function -desc: "Map an interop memory handle to an image memory handle" +desc: "Map an external memory handle to an image memory handle" class: $xBindlessImages name: MapExternalArrayExp ordinal: "0" @@ -721,9 +721,9 @@ params: - type: "const $x_image_desc_t*" name: pImageDesc desc: "[in] pointer to image description" - - type: $x_exp_interop_mem_handle_t - name: hInteropMem - desc: "[in] interop memory handle to the external memory" + - type: $x_exp_external_mem_handle_t + name: hExternalMem + desc: "[in] external memory handle to the external memory" - type: $x_exp_image_mem_native_handle_t* name: phImageMem desc: "[out] image memory handle to the externally allocated memory" @@ -737,9 +737,40 @@ returns: - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function -desc: "Release interop memory" +desc: "Map an external memory handle to a device memory region described by void*" +class: $xBindlessImages +name: MapExternalLinearMemoryExp +ordinal: "0" +params: + - type: $x_context_handle_t + name: hContext + desc: "[in] handle of the context object" + 
- type: $x_device_handle_t + name: hDevice + desc: "[in] handle of the device object" + - type: uint64_t + name: offset + desc: "[in] offset into memory region to map" + - type: uint64_t + name: size + desc: "[in] size of memory region to map" + - type: $x_exp_external_mem_handle_t + name: hExternalMem + desc: "[in] external memory handle to the external memory" + - type: void** + name: ppRetMem + desc: "[out] pointer of the externally allocated memory" +returns: + - $X_RESULT_ERROR_INVALID_CONTEXT + - $X_RESULT_ERROR_INVALID_VALUE + - $X_RESULT_ERROR_INVALID_IMAGE_SIZE + - $X_RESULT_ERROR_INVALID_OPERATION + - $X_RESULT_ERROR_OUT_OF_RESOURCES +--- #-------------------------------------------------------------------------- +type: function +desc: "Release external memory" class: $xBindlessImages -name: ReleaseInteropExp +name: ReleaseExternalMemoryExp ordinal: "0" analogue: - "**cuDestroyExternalMemory**" @@ -750,9 +781,9 @@ params: - type: $x_device_handle_t name: hDevice desc: "[in] handle of the device object" - - type: $x_exp_interop_mem_handle_t - name: hInteropMem - desc: "[in][release] handle of interop memory to be destroyed" + - type: $x_exp_external_mem_handle_t + name: hExternalMem + desc: "[in][release] handle of external memory to be destroyed" returns: - $X_RESULT_ERROR_INVALID_CONTEXT - $X_RESULT_ERROR_INVALID_VALUE @@ -774,12 +805,12 @@ params: - type: $x_exp_external_semaphore_type_t name: semHandleType desc: "[in] type of external memory handle" - - type: $x_exp_interop_semaphore_desc_t* - name: pInteropSemaphoreDesc - desc: "[in] the interop semaphore descriptor" - - type: $x_exp_interop_semaphore_handle_t* - name: phInteropSemaphore - desc: "[out] interop semaphore handle to the external semaphore" + - type: $x_exp_external_semaphore_desc_t* + name: pExternalSemaphoreDesc + desc: "[in] the external semaphore descriptor" + - type: $x_exp_external_semaphore_handle_t* + name: phExternalSemaphore + desc: "[out] external semaphore handle to the 
external semaphore" returns: - $X_RESULT_ERROR_INVALID_CONTEXT - $X_RESULT_ERROR_INVALID_VALUE @@ -798,9 +829,9 @@ params: - type: $x_device_handle_t name: hDevice desc: "[in] handle of the device object" - - type: $x_exp_interop_semaphore_handle_t - name: hInteropSemaphore - desc: "[in][release] handle of interop semaphore to be destroyed" + - type: $x_exp_external_semaphore_handle_t + name: hExternalSemaphore + desc: "[in][release] handle of external semaphore to be destroyed" returns: - $X_RESULT_ERROR_INVALID_CONTEXT - $X_RESULT_ERROR_INVALID_VALUE @@ -816,9 +847,9 @@ params: - type: $x_queue_handle_t name: hQueue desc: "[in] handle of the queue object" - - type: $x_exp_interop_semaphore_handle_t + - type: $x_exp_external_semaphore_handle_t name: hSemaphore - desc: "[in] interop semaphore handle" + desc: "[in] external semaphore handle" - type: bool name: hasWaitValue desc: | @@ -839,7 +870,7 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_VALUE @@ -855,12 +886,12 @@ params: - type: $x_queue_handle_t name: hQueue desc: "[in] handle of the queue object" - - type: $x_exp_interop_semaphore_handle_t + - type: $x_exp_external_semaphore_handle_t name: hSemaphore - desc: "[in] interop semaphore handle" + desc: "[in] external semaphore handle" - type: bool name: hasSignalValue - desc: | + desc: | [in] indicates whether the semaphore is capable and should signal on a certain value. Otherwise the semaphore is treated like a binary state, and `signalValue` is ignored. 
- type: uint64_t @@ -878,7 +909,7 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command instance. + [out][optional] return an event object that identifies this particular command instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_VALUE diff --git a/scripts/core/exp-command-buffer.yml b/scripts/core/exp-command-buffer.yml index 72b4e63f74..ad15db4592 100644 --- a/scripts/core/exp-command-buffer.yml +++ b/scripts/core/exp-command-buffer.yml @@ -21,9 +21,36 @@ etors: - name: COMMAND_BUFFER_SUPPORT_EXP value: "0x1000" desc: "[$x_bool_t] Returns true if the device supports the use of command-buffers." - - name: COMMAND_BUFFER_UPDATE_SUPPORT_EXP + - name: COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP + desc: "[$x_device_command_buffer_update_capability_flags_t] Command-buffer update capabilities of the device" value: "0x1001" - desc: "[$x_bool_t] Returns true if the device supports updating the kernel commands in a command-buffer." + - name: COMMAND_BUFFER_EVENT_SUPPORT_EXP + value: "0x1002" + desc: "[$x_bool_t] Returns true if the device supports using event objects for command synchronization outside of a command-buffer." +--- #-------------------------------------------------------------------------- +type: enum +desc: "Device command-buffer update capabilities" +class: $xDevice +name: $x_device_command_buffer_update_capability_flags_t +etors: + - name: KERNEL_ARGUMENTS + value: "$X_BIT(0)" + desc: "Device supports updating the kernel arguments in command-buffer commands." + - name: LOCAL_WORK_SIZE + value: "$X_BIT(1)" + desc: "Device supports updating the local work-group size in command-buffer commands." + - name: GLOBAL_WORK_SIZE + value: "$X_BIT(2)" + desc: "Device supports updating the global work-group size in command-buffer commands." 
+ - name: GLOBAL_WORK_OFFSET + value: "$X_BIT(3)" + desc: "Device supports updating the global work offset in command-buffer commands." + - name: KERNEL_HANDLE + value: "$X_BIT(4)" + desc: "Device supports updating the kernel handle in command-buffer commands." + - name: EVENTS + value: "$X_BIT(5)" + desc: "Device supports updating the event parameters in command-buffer commands." --- #-------------------------------------------------------------------------- type: enum extend: true @@ -84,6 +111,12 @@ etors: [uint32_t] Reference count of the command-buffer object. The reference count returned should be considered immediately stale. It is unsuitable for general use in applications. This feature is provided for identifying memory leaks. + - name: DESCRIPTOR + desc: | + [$x_exp_command_buffer_desc_t] Returns a $x_exp_command_buffer_desc_t + with the properties of the command-buffer. Returned values may differ + from those passed on construction if the property was ignored by the + adapter. --- #-------------------------------------------------------------------------- type: enum desc: "Command-buffer command query information type" @@ -127,7 +160,7 @@ members: desc: "[in] Argument index." - type: "const ur_kernel_arg_mem_obj_properties_t *" name: pProperties - desc: "[in][optinal] Pointer to memory object properties." + desc: "[in][optional] Pointer to memory object properties." - type: $x_mem_handle_t name: hNewMemObjArg desc: "[in][optional] Handle of memory object to set at argument index." @@ -142,7 +175,7 @@ members: desc: "[in] Argument index." - type: "const ur_kernel_arg_pointer_properties_t *" name: pProperties - desc: "[in][optinal] Pointer to USM pointer properties." + desc: "[in][optional] Pointer to USM pointer properties." - type: "const void *" name: pNewPointerArg desc: "[in][optional] USM pointer to memory location holding the argument value to set at argument index." @@ -160,7 +193,7 @@ members: desc: "[in] Argument size." 
- type: "const ur_kernel_arg_value_properties_t *" name: pProperties - desc: "[in][optinal] Pointer to value properties." + desc: "[in][optional] Pointer to value properties." - type: "const void *" name: pNewValueArg desc: "[in][optional] Argument value representing matching kernel arg type to set at argument index." @@ -170,6 +203,12 @@ desc: "Descriptor type for updating a kernel launch command." base: $x_base_desc_t name: $x_exp_command_buffer_update_kernel_launch_desc_t members: + - type: $x_kernel_handle_t + name: hNewKernel + desc: | + [in][optional] The new kernel handle. If this parameter is nullptr, the current kernel handle in `hCommand` + will be used. If a kernel handle is passed, it must be a valid kernel alternative as defined in + $xCommandBufferAppendKernelLaunchExp. - type: uint32_t name: numNewMemObjArgs desc: "[in] Length of pNewMemObjArgList." @@ -193,13 +232,23 @@ members: desc: "[in][optional][range(0, numNewValueArgs)] An array describing the new kernel value arguments for the command." - type: "size_t*" name: pNewGlobalWorkOffset - desc: "[in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned values that describe the offset used to calculate the global ID." + desc: | + [in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned values that describe the offset used + to calculate the global ID. If this parameter is nullptr, the current global work offset will be used. This parameter is required if `newWorkDim` is different from the current work dimensions + in the command. - type: "size_t*" name: pNewGlobalWorkSize - desc: "[in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned values that describe the number of global work-items." + desc: | + [in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned values that describe the number of + global work-items. If this parameter is nullptr, the current global work size in `hCommand` will be used. 
+ This parameter is required if `newWorkDim` is different from the current work dimensions in the command. - type: "size_t*" name: pNewLocalWorkSize - desc: "[in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned values that describe the number of work-items that make up a work-group. If newWorkDim is non-zero and pNewLocalWorkSize is nullptr, then runtime implementation will choose the work-group size. If newWorkDim is zero and pNewLocalWorkSize is nullptr, then the local work size is unchanged." + desc: | + [in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned values that describe the number of + work-items that make up a work-group. If `pNewGlobalWorkSize` is set and `pNewLocalWorkSize` is nullptr, + then the runtime implementation will choose the local work size. If `pNewGlobalWorkSize` is nullptr and + `pNewLocalWorkSize` is nullptr, the current local work size in the command will be used. --- #-------------------------------------------------------------------------- type: typedef desc: "A value that identifies a command inside of a command-buffer, used for defining dependencies between commands in the same command-buffer." @@ -306,7 +355,16 @@ params: desc: "[in] Global work size to use when executing kernel." - type: "const size_t*" name: pLocalWorkSize - desc: "[in][optional] Local work size to use when executing kernel." + desc: "[in][optional] Local work size to use when executing kernel. If this parameter is nullptr, then a local work size will be generated by the implementation." + - type: uint32_t + name: "numKernelAlternatives" + desc: "[in] The number of kernel alternatives provided in phKernelAlternatives." + - type: $x_kernel_handle_t* + name: "phKernelAlternatives" + desc: | + [in][optional][range(0, numKernelAlternatives)] List of kernel handles that might be used to update the kernel in this + command after the command-buffer is finalized. The default kernel `hKernel` is implicitly marked as an alternative. 
It's + invalid to specify it as part of this list. - type: uint32_t name: numSyncPointsInWaitList desc: "[in] The number of sync points in the provided dependency list." @@ -314,24 +372,46 @@ params: name: pSyncPointWaitList desc: "[in][optional] A list of sync points that this command depends on. May be ignored if command-buffer is in-order." + - type: uint32_t + name: numEventsInWaitList + desc: "[in] Size of the event wait list." + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the command execution. If nullptr, the numEventsInWaitList must be 0, indicating no wait events." - type: "$x_exp_command_buffer_sync_point_t*" name: pSyncPoint desc: "[out][optional] Sync point associated with this command." + - type: $x_event_handle_t* + name: phEvent + desc: "[out][optional] return an event object that will be signaled by the completion of this command in the next execution of the command-buffer." - type: "$x_exp_command_buffer_command_handle_t*" name: phCommand - desc: "[out][optional] Handle to this command." + desc: "[out][optional] Handle to this command. Only available if the + command-buffer is updatable." 
returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP - $X_RESULT_ERROR_INVALID_KERNEL - $X_RESULT_ERROR_INVALID_WORK_DIMENSION - $X_RESULT_ERROR_INVALID_WORK_GROUP_SIZE - - $X_RESULT_ERROR_INVALID_VALUE + - $X_RESULT_ERROR_INVALID_VALUE: + - "`phKernelAlternatives == NULL && numKernelAlternatives > 0`" + - "`phKernelAlternatives != NULL && numKernelAlternatives == 0`" + - "If `phKernelAlternatives` contains `hKernel`" - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP: - "`pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0`" - "`pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0`" + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: + - "If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL." - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES + - $X_RESULT_ERROR_INVALID_OPERATION: + - "phCommand is not NULL and hCommandBuffer is not updatable." --- #-------------------------------------------------------------------------- type: function desc: "Append a USM memcpy command to a command-buffer object." @@ -357,9 +437,21 @@ params: name: pSyncPointWaitList desc: "[in][optional] A list of sync points that this command depends on. May be ignored if command-buffer is in-order." + - type: uint32_t + name: numEventsInWaitList + desc: "[in] Size of the event wait list." + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the command execution. 
If nullptr, the numEventsInWaitList must be 0, indicating no wait events." - type: "$x_exp_command_buffer_sync_point_t*" name: pSyncPoint desc: "[out][optional] Sync point associated with this command." + - type: $x_event_handle_t* + name: phEvent + desc: "[out][optional] return an event object that will be signaled by the completion of this command in the next execution of the command-buffer." + - type: "$x_exp_command_buffer_command_handle_t*" + name: phCommand + desc: "[out][optional] Handle to this command." returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP - $X_RESULT_ERROR_INVALID_SIZE: @@ -370,6 +462,13 @@ returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP: - "`pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0`" - "`pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0`" + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: + - "If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL." - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- @@ -400,9 +499,21 @@ params: name: pSyncPointWaitList desc: "[in][optional] A list of sync points that this command depends on. May be ignored if command-buffer is in-order." + - type: uint32_t + name: numEventsInWaitList + desc: "[in] Size of the event wait list." + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the command execution. If nullptr, the numEventsInWaitList must be 0, indicating no wait events." 
- type: "$x_exp_command_buffer_sync_point_t*" name: pSyncPoint desc: "[out][optional] sync point associated with this command." + - type: $x_event_handle_t* + name: phEvent + desc: "[out][optional] return an event object that will be signaled by the completion of this command in the next execution of the command-buffer." + - type: "$x_exp_command_buffer_command_handle_t*" + name: phCommand + desc: "[out][optional] Handle to this command." returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP - $X_RESULT_ERROR_INVALID_SIZE: @@ -415,6 +526,13 @@ returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP: - "`pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0`" - "`pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0`" + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: + - "If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL." - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- @@ -448,9 +566,21 @@ params: name: pSyncPointWaitList desc: "[in][optional] A list of sync points that this command depends on. May be ignored if command-buffer is in-order." + - type: uint32_t + name: numEventsInWaitList + desc: "[in] Size of the event wait list." + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the command execution. If nullptr, the numEventsInWaitList must be 0, indicating no wait events." 
- type: "$x_exp_command_buffer_sync_point_t*" name: pSyncPoint desc: "[out][optional] Sync point associated with this command." + - type: $x_event_handle_t* + name: phEvent + desc: "[out][optional] return an event object that will be signaled by the completion of this command in the next execution of the command-buffer." + - type: "$x_exp_command_buffer_command_handle_t*" + name: phCommand + desc: "[out][optional] Handle to this command." returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP @@ -458,6 +588,13 @@ returns: - "`pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0`" - "`pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0`" - $X_RESULT_ERROR_INVALID_MEM_OBJECT + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: + - "If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL." - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- @@ -488,9 +625,21 @@ params: name: pSyncPointWaitList desc: "[in][optional] A list of sync points that this command depends on. May be ignored if command-buffer is in-order." + - type: uint32_t + name: numEventsInWaitList + desc: "[in] Size of the event wait list." + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the command execution. If nullptr, the numEventsInWaitList must be 0, indicating no wait events." 
- type: "$x_exp_command_buffer_sync_point_t*" name: pSyncPoint desc: "[out][optional] Sync point associated with this command." + - type: $x_event_handle_t* + name: phEvent + desc: "[out][optional] return an event object that will be signaled by the completion of this command in the next execution of the command-buffer." + - type: "$x_exp_command_buffer_command_handle_t*" + name: phCommand + desc: "[out][optional] Handle to this command." returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP @@ -498,6 +647,13 @@ returns: - "`pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0`" - "`pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0`" - $X_RESULT_ERROR_INVALID_MEM_OBJECT + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: + - "If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL." - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- @@ -528,9 +684,21 @@ params: name: pSyncPointWaitList desc: "[in][optional] A list of sync points that this command depends on. May be ignored if command-buffer is in-order." + - type: uint32_t + name: numEventsInWaitList + desc: "[in] Size of the event wait list." + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the command execution. If nullptr, the numEventsInWaitList must be 0, indicating no wait events." 
- type: "$x_exp_command_buffer_sync_point_t*" name: pSyncPoint desc: "[out][optional] Sync point associated with this command." + - type: $x_event_handle_t* + name: phEvent + desc: "[out][optional] return an event object that will be signaled by the completion of this command in the next execution of the command-buffer." + - type: "$x_exp_command_buffer_command_handle_t*" + name: phCommand + desc: "[out][optional] Handle to this command." returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP @@ -538,6 +706,13 @@ returns: - "`pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0`" - "`pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0`" - $X_RESULT_ERROR_INVALID_MEM_OBJECT + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: + - "If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL." - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- @@ -583,9 +758,21 @@ params: name: pSyncPointWaitList desc: "[in][optional] A list of sync points that this command depends on. May be ignored if command-buffer is in-order." + - type: uint32_t + name: numEventsInWaitList + desc: "[in] Size of the event wait list." + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the command execution. If nullptr, the numEventsInWaitList must be 0, indicating no wait events." 
- type: $x_exp_command_buffer_sync_point_t* name: pSyncPoint desc: "[out][optional] Sync point associated with this command." + - type: $x_event_handle_t* + name: phEvent + desc: "[out][optional] return an event object that will be signaled by the completion of this command in the next execution of the command-buffer." + - type: "$x_exp_command_buffer_command_handle_t*" + name: phCommand + desc: "[out][optional] Handle to this command." returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP @@ -593,6 +780,13 @@ returns: - "`pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0`" - "`pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0`" - $X_RESULT_ERROR_INVALID_MEM_OBJECT + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: + - "If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL." - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- @@ -638,9 +832,21 @@ params: name: pSyncPointWaitList desc: "[in][optional] A list of sync points that this command depends on. May be ignored if command-buffer is in-order." + - type: uint32_t + name: numEventsInWaitList + desc: "[in] Size of the event wait list." + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the command execution. If nullptr, the numEventsInWaitList must be 0, indicating no wait events." 
- type: $x_exp_command_buffer_sync_point_t* name: pSyncPoint desc: "[out][optional] Sync point associated with this command." + - type: $x_event_handle_t* + name: phEvent + desc: "[out][optional] return an event object that will be signaled by the completion of this command in the next execution of the command-buffer." + - type: "$x_exp_command_buffer_command_handle_t*" + name: phCommand + desc: "[out][optional] Handle to this command." returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP @@ -648,6 +854,13 @@ returns: - "`pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0`" - "`pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0`" - $X_RESULT_ERROR_INVALID_MEM_OBJECT + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: + - "If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL." - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- @@ -693,9 +906,21 @@ params: name: pSyncPointWaitList desc: "[in][optional] A list of sync points that this command depends on. May be ignored if command-buffer is in-order." + - type: uint32_t + name: numEventsInWaitList + desc: "[in] Size of the event wait list." + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the command execution. If nullptr, the numEventsInWaitList must be 0, indicating no wait events." 
- type: $x_exp_command_buffer_sync_point_t* name: pSyncPoint desc: "[out][optional] Sync point associated with this command." + - type: $x_event_handle_t* + name: phEvent + desc: "[out][optional] return an event object that will be signaled by the completion of this command in the next execution of the command-buffer." + - type: "$x_exp_command_buffer_command_handle_t*" + name: phCommand + desc: "[out][optional] Handle to this command." returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP @@ -703,6 +928,13 @@ returns: - "`pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0`" - "`pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0`" - $X_RESULT_ERROR_INVALID_MEM_OBJECT + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: + - "If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL." - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- @@ -736,9 +968,21 @@ params: name: pSyncPointWaitList desc: "[in][optional] A list of sync points that this command depends on. May be ignored if command-buffer is in-order." + - type: uint32_t + name: numEventsInWaitList + desc: "[in] Size of the event wait list." + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the command execution. If nullptr, the numEventsInWaitList must be 0, indicating no wait events." 
- type: $x_exp_command_buffer_sync_point_t* name: pSyncPoint desc: "[out][optional] sync point associated with this command." + - type: $x_event_handle_t* + name: phEvent + desc: "[out][optional] return an event object that will be signaled by the completion of this command in the next execution of the command-buffer." + - type: "$x_exp_command_buffer_command_handle_t*" + name: phCommand + desc: "[out][optional] Handle to this command." returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP @@ -748,6 +992,13 @@ returns: - $X_RESULT_ERROR_INVALID_MEM_OBJECT - $X_RESULT_ERROR_INVALID_SIZE: - "If `offset + size` results in an out-of-bounds access." + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: + - "If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL." - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- @@ -778,9 +1029,21 @@ params: name: pSyncPointWaitList desc: "[in][optional] A list of sync points that this command depends on. May be ignored if command-buffer is in-order." + - type: uint32_t + name: numEventsInWaitList + desc: "[in] Size of the event wait list." + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the command execution. If nullptr, the numEventsInWaitList must be 0, indicating no wait events." 
- type: "$x_exp_command_buffer_sync_point_t*" name: pSyncPoint desc: "[out][optional] sync point associated with this command." + - type: $x_event_handle_t* + name: phEvent + desc: "[out][optional] return an event object that will be signaled by the completion of this command in the next execution of the command-buffer." + - type: "$x_exp_command_buffer_command_handle_t*" + name: phCommand + desc: "[out][optional] Handle to this command." returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP @@ -791,6 +1054,13 @@ returns: - $X_RESULT_ERROR_INVALID_SIZE: - "`size == 0`" - "If `size` is higher than the allocation size of `pMemory`" + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: + - "If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL." - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- @@ -822,9 +1092,21 @@ params: name: pSyncPointWaitList desc: "[in][optional] A list of sync points that this command depends on. May be ignored if command-buffer is in-order." + - type: uint32_t + name: numEventsInWaitList + desc: "[in] Size of the event wait list." + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the command execution. If nullptr, the numEventsInWaitList must be 0, indicating no wait events." - type: "$x_exp_command_buffer_sync_point_t*" name: pSyncPoint desc: "[out][optional] sync point associated with this command." 
+ - type: $x_event_handle_t* + name: phEvent + desc: "[out][optional] return an event object that will be signaled by the completion of this command in the next execution of the command-buffer." + - type: "$x_exp_command_buffer_command_handle_t*" + name: phCommand + desc: "[out][optional] Handle to this command." returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP @@ -835,6 +1117,13 @@ returns: - $X_RESULT_ERROR_INVALID_SIZE: - "`size == 0`" - "If `size` is higher than the allocation size of `pMemory`" + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: + - "If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL." - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- @@ -860,7 +1149,7 @@ params: - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular command-buffer execution instance. + [out][optional] return an event object that identifies this particular command-buffer execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. returns: - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP - $X_RESULT_ERROR_INVALID_QUEUE @@ -899,7 +1188,8 @@ returns: - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY --- #-------------------------------------------------------------------------- type: function -desc: "Update a kernel launch command in a finalized command-buffer. 
This entry-point is synchronous and may block if the command-buffer is executing when the entry-point is called." +desc: "Update a kernel launch command in a finalized command-buffer." +details: "This entry-point is synchronous and may block if the command-buffer is executing when the entry-point is called." class: $xCommandBuffer name: UpdateKernelLaunchExp params: @@ -909,29 +1199,81 @@ params: - type: "const $x_exp_command_buffer_update_kernel_launch_desc_t*" name: pUpdateKernelLaunch desc: "[in] Struct defining how the kernel command is to be updated." - returns: - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: - "If update functionality is not supported by the device." - $X_RESULT_ERROR_INVALID_OPERATION: - "If $x_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to." - "If the command-buffer `hCommand` belongs to has not been finalized." - - "If `pUpdateKernellaunch->newWorkDim` is non-zero and different from the work-dim used on creation of `hCommand`." - - "If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value and `pUpdateKernelLaunch->pNewGlobalWorkSize` is NULL." - - "If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value when `hCommand` was created with a NULL local work size." - - "If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a NULL value when `hCommand` was created with a non-NULL local work size." - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP + - "If `hCommand` is not a kernel execution command." 
- $X_RESULT_ERROR_INVALID_MEM_OBJECT - $X_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX - $X_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE - $X_RESULT_ERROR_INVALID_ENUMERATION - - $X_RESULT_ERROR_INVALID_WORK_DIMENSION + - $X_RESULT_ERROR_INVALID_WORK_DIMENSION: + - "`pUpdateKernelLaunch->newWorkDim < 1 || pUpdateKernelLaunch->newWorkDim > 3`" - $X_RESULT_ERROR_INVALID_WORK_GROUP_SIZE + - $X_RESULT_ERROR_INVALID_VALUE: + - "If `pUpdateKernelLaunch->hNewKernel` was not passed to the `hKernel` or `phKernelAlternatives` parameters of $xCommandBufferAppendKernelLaunchExp when this command was created." + - "If `pUpdateKernelLaunch->newWorkDim` is different from the current workDim in `hCommand` and either `pUpdateKernelLaunch->pNewGlobalWorkSize` or `pUpdateKernelLaunch->pNewGlobalWorkOffset` is nullptr." + - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY + - $X_RESULT_ERROR_OUT_OF_RESOURCES +--- #-------------------------------------------------------------------------- +type: function +desc: "Get a new event that will be signaled the next time the command in the command-buffer executes." +details: "It is the user's responsibility to release the returned `phSignalEvent`." +class: $xCommandBuffer +name: UpdateSignalEventExp +params: + - type: $x_exp_command_buffer_command_handle_t + name: hCommand + desc: "[in] Handle of the command-buffer command to update." + - type: "$x_event_handle_t*" + name: phSignalEvent + desc: "[out] Event to be signaled." +returns: + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: + - "If UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS is not supported by the device associated with `hCommand`." + - $X_RESULT_ERROR_INVALID_OPERATION: + - "If $x_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to." + - "If the command-buffer `hCommand` belongs to has not been finalized." + - "If no `phEvent` parameter was set on creation of the command associated with `hCommand`." 
+ - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP - $X_RESULT_ERROR_INVALID_VALUE - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function +desc: "Set the list of wait events for a command to depend on to a list of new events." +class: $xCommandBuffer +name: UpdateWaitEventsExp +params: + - type: $x_exp_command_buffer_command_handle_t + name: hCommand + desc: "[in] Handle of the command-buffer command to update." + - type: uint32_t + name: numEventsInWaitList + desc: "[in] Size of the event wait list." + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the command execution. If nullptr, the numEventsInWaitList must be 0, indicating no wait events." +returns: + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: + - "If UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS is not supported by the device associated with `hCommand`." + - $X_RESULT_ERROR_INVALID_OPERATION: + - "If $x_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to." + - "If the command-buffer `hCommand` belongs to has not been finalized." + - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." + - "If `numEventsInWaitList` does not match the number of wait events set when the command associated with `hCommand` was created." + - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY + - $X_RESULT_ERROR_OUT_OF_RESOURCES +--- #-------------------------------------------------------------------------- +type: function desc: "Get command-buffer object information." 
class: $xCommandBuffer name: GetInfoExp diff --git a/scripts/core/exp-cooperative-kernels.yml b/scripts/core/exp-cooperative-kernels.yml index 2262f9433b..941aba29fa 100644 --- a/scripts/core/exp-cooperative-kernels.yml +++ b/scripts/core/exp-cooperative-kernels.yml @@ -37,13 +37,13 @@ params: name: pGlobalWorkOffset desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item" - type: "const size_t*" - name: pGlobalWorkSize + name: pGlobalWorkSize desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function" - type: "const size_t*" name: pLocalWorkSize desc: | [in][optional] pointer to an array of workDim unsigned values that specify the number of local work-items forming a work-group that will execute the kernel function. - If nullptr, the runtime implementation will choose the work-group size. + If nullptr, the runtime implementation will choose the work-group size. - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" @@ -51,11 +51,11 @@ params: name: phEventWaitList desc: | [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. - If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. + If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. - type: $x_event_handle_t* name: phEvent desc: | - [out][optional] return an event object that identifies this particular kernel execution instance. + [out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. 
returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_KERNEL @@ -87,5 +87,5 @@ params: - type: "uint32_t*" name: "pGroupCountRet" desc: "[out] pointer to maximum number of groups" -returns: +returns: - $X_RESULT_ERROR_INVALID_KERNEL diff --git a/scripts/core/exp-enqueue-timestamp-recording.yml b/scripts/core/exp-enqueue-timestamp-recording.yml index d8eff2a6cc..e81906f9ae 100644 --- a/scripts/core/exp-enqueue-timestamp-recording.yml +++ b/scripts/core/exp-enqueue-timestamp-recording.yml @@ -17,7 +17,7 @@ extend: true typed_etors: true desc: "Extension enums to $x_device_info_t to support timestamp recordings." name: $x_device_info_t -etors: +etors: - name: TIMESTAMP_RECORDING_SUPPORT_EXP value: "0x2018" desc: "[$x_bool_t] returns true if the device supports timestamp recording" @@ -52,14 +52,14 @@ params: name: phEventWaitList desc: | [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. - If nullptr, the numEventsInWaitList must be 0, indicating no wait events. + If nullptr, the numEventsInWaitList must be 0, indicating no wait events. - type: $x_event_handle_t* name: phEvent desc: | [in,out] return an event object that identifies this particular kernel execution instance. Profiling information can be queried from this event as if `hQueue` had profiling enabled. Querying `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` reports the timestamp at the time of the call to this function. Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` - reports the timestamp recorded when the command is executed on the device. + reports the timestamp recorded when the command is executed on the device. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. 
returns: - $X_RESULT_ERROR_INVALID_NULL_HANDLE - $X_RESULT_ERROR_INVALID_NULL_POINTER diff --git a/scripts/core/exp-launch-properties.yml b/scripts/core/exp-launch-properties.yml index aef4b2844a..9e66e9ea06 100644 --- a/scripts/core/exp-launch-properties.yml +++ b/scripts/core/exp-launch-properties.yml @@ -102,7 +102,7 @@ params: desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. " - type: ur_event_handle_t* name: phEvent - desc: "[out][optional] return an event object that identifies this particular kernel execution instance." + desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array." returns: - $X_RESULT_SUCCESS - $X_RESULT_ERROR_UNINITIALIZED @@ -138,4 +138,3 @@ etors: - name: CLUSTER_LAUNCH_EXP value: "0x1111" desc: "[$x_bool_t] return true if enqueue Cluster Launch is supported" - diff --git a/scripts/core/exp-multi-device-compile.yml b/scripts/core/exp-multi-device-compile.yml index 8ccba8c623..ddd1fcf541 100644 --- a/scripts/core/exp-multi-device-compile.yml +++ b/scripts/core/exp-multi-device-compile.yml @@ -41,7 +41,7 @@ params: desc: "[in] number of devices" - type: $x_device_handle_t* name: phDevices - desc: "[in][range(0, numDevices)] pointer to array of device handles" + desc: "[in][range(0, numDevices)] pointer to array of device handles" - type: const char* name: pOptions desc: "[in][optional] pointer to build options null-terminated string." 
@@ -72,7 +72,7 @@ params: desc: "[in] number of devices" - type: $x_device_handle_t* name: phDevices - desc: "[in][range(0, numDevices)] pointer to array of device handles" + desc: "[in][range(0, numDevices)] pointer to array of device handles" - type: const char* name: pOptions desc: "[in][optional] pointer to build options null-terminated string." @@ -104,7 +104,7 @@ params: desc: "[in] number of devices" - type: $x_device_handle_t* name: phDevices - desc: "[in][range(0, numDevices)] pointer to array of device handles" + desc: "[in][range(0, numDevices)] pointer to array of device handles" - type: uint32_t name: count desc: "[in] number of program handles in `phPrograms`." diff --git a/scripts/core/exp-native-enqueue.yml b/scripts/core/exp-native-enqueue.yml index e726fe623f..fd8902a41e 100644 --- a/scripts/core/exp-native-enqueue.yml +++ b/scripts/core/exp-native-enqueue.yml @@ -112,7 +112,7 @@ params: name: phEvent desc: | [out][optional] return an event object that identifies the work that has - been enqueued in nativeEnqueueFunc. + been enqueued in nativeEnqueueFunc. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. returns: - $X_RESULT_ERROR_INVALID_NULL_HANDLE - $X_RESULT_ERROR_INVALID_NULL_POINTER diff --git a/scripts/core/kernel.yml b/scripts/core/kernel.yml index 4815c8413a..5bd95e1847 100644 --- a/scripts/core/kernel.yml +++ b/scripts/core/kernel.yml @@ -64,7 +64,9 @@ params: desc: "[in][optional] pointer to value properties." - type: "const void*" name: pArgValue - desc: "[in] argument value represented as matching arg type." + desc: | + [in] argument value represented as matching arg type. + The data pointed to will be copied and therefore can be reused on return. returns: - $X_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX - $X_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE @@ -113,7 +115,7 @@ etors: - name: REFERENCE_COUNT desc: | [uint32_t] Reference count of the kernel object. 
- The reference count returned should be considered immediately stale. + The reference count returned should be considered immediately stale. It is unsuitable for general use in applications. This feature is provided for identifying memory leaks. - name: CONTEXT desc: "[$x_context_handle_t] Return Context object associated with Kernel." @@ -142,6 +144,14 @@ etors: desc: "[size_t] Return preferred multiple of Work Group size for launch" - name: PRIVATE_MEM_SIZE desc: "[size_t] Return minimum amount of private memory in bytes used by each work item in the Kernel" + - name: COMPILE_MAX_WORK_GROUP_SIZE + desc: | + [size_t[3]] Return the maximum Work Group size guaranteed by the + source code, or (0, 0, 0) if unspecified + - name: COMPILE_MAX_LINEAR_WORK_GROUP_SIZE + desc: | + [size_t] Return the maximum linearized Work Group size (X * Y * Z) + guaranteed by the source code, or 0 if unspecified --- #-------------------------------------------------------------------------- type: enum desc: "Get Kernel SubGroup information" @@ -200,12 +210,12 @@ params: - type: "size_t" name: propSize desc: | - [in] the size of the Kernel property value. + [in] the size of the Kernel property value. - type: "void*" name: pPropValue desc: | [in,out][optional][typename(propName, propSize)] array of bytes holding the kernel info property. - If propSize is not equal to or greater than the real number of bytes needed to return + If propSize is not equal to or greater than the real number of bytes needed to return the info then the $X_RESULT_ERROR_INVALID_SIZE error is returned and pPropValue is not used. - type: "size_t*" name: "pPropSizeRet" @@ -465,6 +475,11 @@ returns: - "`count == 0`" - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: - "If $X_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS query is false" + - $X_RESULT_ERROR_INVALID_VALUE: + - "A pSpecConstant entry contains a size that does not match that of the specialization constant in the module." 
+ - "A pSpecConstant entry contains a nullptr pValue." + - $X_RESULT_ERROR_INVALID_SPEC_ID: + - "Any id specified in a pSpecConstant entry is not a valid specialization constant identifier." --- #-------------------------------------------------------------------------- type: function desc: "Return platform native kernel handle." @@ -566,10 +581,10 @@ params: [in] pointer to an array of numWorkDim unsigned values that specify the offset used to calculate the global ID of a work-item - type: const size_t* - name: pGlobalWorkSize + name: pGlobalWorkSize desc: | [in] pointer to an array of numWorkDim unsigned values that specify - the number of global work-items in workDim that will execute the + the number of global work-items in workDim that will execute the kernel function - type: size_t* name: pSuggestedLocalWorkSize diff --git a/scripts/core/loader.yml b/scripts/core/loader.yml index fc02e60ef4..8b0caf3b67 100644 --- a/scripts/core/loader.yml +++ b/scripts/core/loader.yml @@ -135,7 +135,7 @@ params: desc: "[in] Handle to config object the layer will be enabled for." - type: const char* name: pLayerName - desc: "[in] Null terminated string containing the name of the layer to enable." + desc: "[in] Null terminated string containing the name of the layer to enable. Empty if none are enabled." returns: - $X_RESULT_ERROR_LAYER_NOT_PRESENT: - "If layer specified with `pLayerName` can't be found by the loader." @@ -210,7 +210,7 @@ loader_only: True name: Init decl: static ordinal: "0" -details: +details: - "The application must call this function before calling any other function." - "If this function is not called then all other functions will return $X_RESULT_ERROR_UNINITIALIZED." - "Only one instance of the loader will be initialized per process." 
diff --git a/scripts/core/memory.yml b/scripts/core/memory.yml index 4df4ae0d0b..0fde537f37 100644 --- a/scripts/core/memory.yml +++ b/scripts/core/memory.yml @@ -211,12 +211,12 @@ analogue: - "**clCreateImage**" details: - | - The primary $x_image_format_t that must be supported by all the adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT8}, - {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT16}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT8}, - {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT16}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8}, - {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32}, - {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16}, - {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT}, + The primary $x_image_format_t that must be supported by all the adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT8}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT16}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT8}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT16}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_FLOAT}. 
params: - type: $x_context_handle_t diff --git a/scripts/core/platform.yml b/scripts/core/platform.yml index f3069005ab..a1aa0dc7ca 100644 --- a/scripts/core/platform.yml +++ b/scripts/core/platform.yml @@ -33,7 +33,7 @@ params: - type: "uint32_t" name: NumEntries desc: | - [in] the number of platforms to be added to phPlatforms. + [in] the number of platforms to be added to phPlatforms. If phPlatforms is not NULL, then NumEntries should be greater than zero, otherwise $X_RESULT_ERROR_INVALID_SIZE, will be returned. - type: "$x_platform_handle_t*" @@ -44,7 +44,7 @@ params: - type: "uint32_t*" name: "pNumPlatforms" desc: | - [out][optional] returns the total number of platforms available. + [out][optional] returns the total number of platforms available. returns: - $X_RESULT_ERROR_INVALID_SIZE: - "`NumEntries == 0 && phPlatforms != NULL`" @@ -143,6 +143,9 @@ etors: - name: "0_10" value: "$X_MAKE_VERSION( 0, 10 )" desc: "version 0.10" + - name: "0_11" + value: "$X_MAKE_VERSION( 0, 11 )" + desc: "version 0.11" --- #-------------------------------------------------------------------------- type: function desc: "Returns the API version supported by the specified platform" diff --git a/scripts/core/program.yml b/scripts/core/program.yml index b7da9d62e7..23f07d4287 100644 --- a/scripts/core/program.yml +++ b/scripts/core/program.yml @@ -366,7 +366,7 @@ etors: - name: REFERENCE_COUNT desc: | [uint32_t] Reference count of the program object. - The reference count returned should be considered immediately stale. + The reference count returned should be considered immediately stale. It is unsuitable for general use in applications. This feature is provided for identifying memory leaks. - name: CONTEXT desc: "[$x_context_handle_t] Program context info." @@ -376,8 +376,8 @@ etors: desc: | [$x_device_handle_t[]] Return list of devices associated with a program. 
This is either the list of devices associated with the context or a subset of those devices when the program is created using $xProgramCreateWithBinary. - - name: SOURCE - desc: "[char[]] Return program source associated with Program." + - name: IL + desc: "[char[]] Return program IL if the program was created with $xProgramCreateWithIL, otherwise return size will be set to 0 and nothing will be returned." - name: BINARY_SIZES desc: "[size_t[]] Return program binary sizes for each device." - name: BINARIES @@ -409,7 +409,7 @@ params: name: pPropValue desc: | [in,out][optional][typename(propName, propSize)] array of bytes of holding the program info property. - If propSize is not equal to or greater than the real number of bytes needed to return + If propSize is not equal to or greater than the real number of bytes needed to return the info then the $X_RESULT_ERROR_INVALID_SIZE error is returned and pPropValue is not used. - type: "size_t*" name: "pPropSizeRet" @@ -542,6 +542,11 @@ params: returns: - $X_RESULT_ERROR_INVALID_SIZE: - "`count == 0`" + - $X_RESULT_ERROR_INVALID_VALUE: + - "A pSpecConstant entry contains a size that does not match that of the specialization constant in the module." + - "A pSpecConstant entry contains a nullptr pValue." + - $X_RESULT_ERROR_INVALID_SPEC_ID: + - "Any id specified in a pSpecConstant entry is not a valid specialization constant identifier." --- #-------------------------------------------------------------------------- type: function desc: "Return program native program handle." diff --git a/scripts/core/queue.yml b/scripts/core/queue.yml index 263685d1aa..c8a6528fbd 100644 --- a/scripts/core/queue.yml +++ b/scripts/core/queue.yml @@ -29,7 +29,7 @@ etors: - name: REFERENCE_COUNT desc: | [uint32_t] Reference count of the queue object. - The reference count returned should be considered immediately stale. + The reference count returned should be considered immediately stale. It is unsuitable for general use in applications. 
This feature is provided for identifying memory leaks. - name: SIZE desc: | @@ -208,7 +208,7 @@ params: returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - - $X_RESULT_ERROR_OUT_OF_RESOURCES + - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: struct desc: "Descriptor for $xQueueGetNativeHandle and $xQueueCreateWithNativeHandle." diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index 45fbf5d161..f9f5c59f41 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -364,9 +364,6 @@ etors: - name: COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_EXP desc: Enumerator for $xCommandBufferAppendKernelLaunchExp value: '125' -- name: COMMAND_BUFFER_ENQUEUE_EXP - desc: Enumerator for $xCommandBufferEnqueueExp - value: '128' - name: USM_PITCHED_ALLOC_EXP desc: Enumerator for $xUSMPitchedAllocExp value: '132' @@ -403,9 +400,6 @@ etors: - name: BINDLESS_IMAGES_MAP_EXTERNAL_ARRAY_EXP desc: Enumerator for $xBindlessImagesMapExternalArrayExp value: '144' -- name: BINDLESS_IMAGES_RELEASE_INTEROP_EXP - desc: Enumerator for $xBindlessImagesReleaseInteropExp - value: '145' - name: BINDLESS_IMAGES_RELEASE_EXTERNAL_SEMAPHORE_EXP desc: Enumerator for $xBindlessImagesReleaseExternalSemaphoreExp value: '147' @@ -514,39 +508,6 @@ etors: - name: LOADER_TEAR_DOWN desc: Enumerator for $xLoaderTearDown value: '202' -- name: COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP - desc: Enumerator for $xCommandBufferAppendUSMMemcpyExp - value: '203' -- name: COMMAND_BUFFER_APPEND_USM_FILL_EXP - desc: Enumerator for $xCommandBufferAppendUSMFillExp - value: '204' -- name: COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP - desc: Enumerator for $xCommandBufferAppendMemBufferCopyExp - value: '205' -- name: COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP - desc: Enumerator for $xCommandBufferAppendMemBufferWriteExp - value: '206' -- name: COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_EXP - desc: Enumerator for 
$xCommandBufferAppendMemBufferReadExp - value: '207' -- name: COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_RECT_EXP - desc: Enumerator for $xCommandBufferAppendMemBufferCopyRectExp - value: '208' -- name: COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_RECT_EXP - desc: Enumerator for $xCommandBufferAppendMemBufferWriteRectExp - value: '209' -- name: COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_RECT_EXP - desc: Enumerator for $xCommandBufferAppendMemBufferReadRectExp - value: '210' -- name: COMMAND_BUFFER_APPEND_MEM_BUFFER_FILL_EXP - desc: Enumerator for $xCommandBufferAppendMemBufferFillExp - value: '211' -- name: COMMAND_BUFFER_APPEND_USM_PREFETCH_EXP - desc: Enumerator for $xCommandBufferAppendUSMPrefetchExp - value: '212' -- name: COMMAND_BUFFER_APPEND_USM_ADVISE_EXP - desc: Enumerator for $xCommandBufferAppendUSMAdviseExp - value: '213' - name: ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP desc: Enumerator for $xEnqueueCooperativeKernelLaunchExp value: '214' @@ -595,12 +556,60 @@ etors: - name: LOADER_CONFIG_SET_MOCKING_ENABLED desc: Enumerator for $xLoaderConfigSetMockingEnabled value: '229' +- name: BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP + desc: Enumerator for $xBindlessImagesReleaseExternalMemoryExp + value: '230' +- name: COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP + desc: Enumerator for $xCommandBufferAppendUSMMemcpyExp + value: '231' +- name: COMMAND_BUFFER_APPEND_USM_FILL_EXP + desc: Enumerator for $xCommandBufferAppendUSMFillExp + value: '232' +- name: COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP + desc: Enumerator for $xCommandBufferAppendMemBufferCopyExp + value: '233' +- name: COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP + desc: Enumerator for $xCommandBufferAppendMemBufferWriteExp + value: '234' +- name: COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_EXP + desc: Enumerator for $xCommandBufferAppendMemBufferReadExp + value: '235' +- name: COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_RECT_EXP + desc: Enumerator for $xCommandBufferAppendMemBufferCopyRectExp + value: '236' +- name: 
COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_RECT_EXP + desc: Enumerator for $xCommandBufferAppendMemBufferWriteRectExp + value: '237' +- name: COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_RECT_EXP + desc: Enumerator for $xCommandBufferAppendMemBufferReadRectExp + value: '238' +- name: COMMAND_BUFFER_APPEND_MEM_BUFFER_FILL_EXP + desc: Enumerator for $xCommandBufferAppendMemBufferFillExp + value: '239' +- name: COMMAND_BUFFER_APPEND_USM_PREFETCH_EXP + desc: Enumerator for $xCommandBufferAppendUSMPrefetchExp + value: '240' +- name: COMMAND_BUFFER_APPEND_USM_ADVISE_EXP + desc: Enumerator for $xCommandBufferAppendUSMAdviseExp + value: '241' +- name: COMMAND_BUFFER_ENQUEUE_EXP + desc: Enumerator for $xCommandBufferEnqueueExp + value: '242' +- name: COMMAND_BUFFER_UPDATE_SIGNAL_EVENT_EXP + desc: Enumerator for $xCommandBufferUpdateSignalEventExp + value: '243' +- name: COMMAND_BUFFER_UPDATE_WAIT_EVENTS_EXP + desc: Enumerator for $xCommandBufferUpdateWaitEventsExp + value: '244' +- name: BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP + desc: Enumerator for $xBindlessImagesMapExternalLinearMemoryExp + value: '245' - name: TENSOR_MAP_ENCODE_IM_2_COL_EXP desc: Enumerator for $xTensorMapEncodeIm2ColExp - value: '230' + value: '246' - name: TENSOR_MAP_ENCODE_TILED_EXP desc: Enumerator for $xTensorMapEncodeTiledExp - value: '231' + value: '247' --- type: enum desc: Defines structure types diff --git a/scripts/core/sampler.yml b/scripts/core/sampler.yml index 6459277c6f..7e555386b0 100644 --- a/scripts/core/sampler.yml +++ b/scripts/core/sampler.yml @@ -52,7 +52,7 @@ etors: - name: REFERENCE_COUNT desc: | [uint32_t] Reference count of the sampler object. - The reference count returned should be considered immediately stale. + The reference count returned should be considered immediately stale. It is unsuitable for general use in applications. This feature is provided for identifying memory leaks. 
- name: CONTEXT desc: "[$x_context_handle_t] Sampler context info" @@ -136,7 +136,7 @@ params: returns: - $X_RESULT_ERROR_INVALID_SAMPLER - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - - $X_RESULT_ERROR_OUT_OF_RESOURCES + - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- type: function desc: "Query information about a sampler object" diff --git a/scripts/core/usm.yml b/scripts/core/usm.yml index da5cd8c578..22f975a65d 100644 --- a/scripts/core/usm.yml +++ b/scripts/core/usm.yml @@ -154,7 +154,7 @@ members: Must be zero or a power of 2. Must be equal to or smaller than the size of the largest data type supported by `hDevice`. --- #-------------------------------------------------------------------------- -type: struct +type: struct desc: "USM host allocation descriptor type." details: - Specify these properties in $xUSMHostAlloc and $xUSMSharedAlloc via $x_usm_desc_t @@ -252,10 +252,10 @@ params: - type: void** name: ppMem desc: "[out] pointer to USM host memory object" -returns: +returns: - $X_RESULT_ERROR_INVALID_CONTEXT - $X_RESULT_ERROR_INVALID_OPERATION: - - "If $X_DEVICE_INFO_USM_HOST_SUPPORT is false." + - "If $X_DEVICE_INFO_USM_HOST_SUPPORT is false." - $X_RESULT_ERROR_INVALID_VALUE: - "`pUSMDesc && pUSMDesc->align != 0 && ((pUSMDesc->align & (pUSMDesc->align-1)) != 0)`" # alignment must be power of two - "If `align` is greater that the size of the largest data type supported by `hDevice`." @@ -297,7 +297,7 @@ params: - type: void** name: ppMem desc: "[out] pointer to USM device memory object" -returns: +returns: - $X_RESULT_ERROR_INVALID_CONTEXT - $X_RESULT_ERROR_INVALID_OPERATION: - "If $X_DEVICE_INFO_USM_HOST_SUPPORT is false." 
@@ -343,7 +343,7 @@ params: - type: void** name: ppMem desc: "[out] pointer to USM shared memory object" -returns: +returns: - $X_RESULT_ERROR_INVALID_CONTEXT - $X_RESULT_ERROR_INVALID_VALUE: - "`pUSMDesc && pUSMDesc->align != 0 && ((pUSMDesc->align & (pUSMDesc->align-1)) != 0)`" # alignment must be power of two @@ -462,7 +462,7 @@ etors: - name: REFERENCE_COUNT desc: | [uint32_t] Reference count of the pool object. - The reference count returned should be considered immediately stale. + The reference count returned should be considered immediately stale. It is unsuitable for general use in applications. This feature is provided for identifying memory leaks. - name: CONTEXT desc: "[$x_context_handle_t] USM memory pool context info" diff --git a/scripts/core/virtual_memory.yml b/scripts/core/virtual_memory.yml index 5b12e1761e..133266de64 100644 --- a/scripts/core/virtual_memory.yml +++ b/scripts/core/virtual_memory.yml @@ -220,7 +220,7 @@ params: name: size desc: "[in] size in bytes of the virtual memory range." - type: $x_virtual_mem_info_t - name: propName + name: propName desc: "[in] type of the info to query." 
- type: size_t name: propSize diff --git a/scripts/generate_code.py b/scripts/generate_code.py index b3a1146a3d..0c7476ab42 100644 --- a/scripts/generate_code.py +++ b/scripts/generate_code.py @@ -108,6 +108,26 @@ def _mako_print_cpp(path, namespace, tags, version, specs, meta): specs=specs, meta=meta) + +def _mako_api_funcs(path, namespace, tags, version, revision, specs, meta): + template = "api_funcs.def.mako" + fin = os.path.join(templates_dir, template) + + name = "%s_api_funcs"%(namespace) + filename = "%s.def"%(name) + fout = os.path.join(path, filename) + + print("Generating %s..."%fout) + return util.makoWrite( + fin, fout, + name=name, + ver=version, + rev=revision, + namespace=namespace, + tags=tags, + specs=specs, + meta=meta) + """ generates c/c++ files from the specification documents """ @@ -116,6 +136,7 @@ def _generate_api_cpp(incpath, srcpath, namespace, tags, version, revision, spec loc += _mako_api_cpp(srcpath, namespace, tags, version, revision, specs, meta) loc += _mako_ddi_h(incpath, namespace, tags, version, revision, specs, meta) loc += _mako_print_hpp(incpath, namespace, tags, version, revision, specs, meta) + loc += _mako_api_funcs(incpath, namespace, tags, version, revision, specs, meta) return loc @@ -379,6 +400,32 @@ def generate_loader(path, section, namespace, tags, version, specs, meta): ) print("Generated %s lines of code.\n"%loc) +""" + generates c/c++ files from the specification documents +""" +def _mako_interface_loader_api(path, adapter, ext, namespace, tags, version, specs, meta): + dstpath = os.path.join(path, adapter) + os.makedirs(dstpath, exist_ok=True) + + template = f"ur_interface_loader.{ext}.mako" + fin = os.path.join(templates_dir, template) + + name = f"ur_interface_loader" + + filename = f"{name}.{ext}" + fout = os.path.join(dstpath, filename) + + print("Generating %s..."%fout) + return util.makoWrite( + fin, fout, + name=name, + adapter=adapter, + ver=version, + namespace=namespace, + tags=tags, + specs=specs, + 
meta=meta,) + """ Entry-point: generates adapter for unified_runtime @@ -395,6 +442,10 @@ def generate_adapters(path, section, namespace, tags, version, specs, meta): loc += _mako_linker_scripts( dstpath, "adapter", "def", namespace, tags, version, specs, meta ) + + loc += _mako_interface_loader_api(dstpath, "level_zero", "cpp", namespace, tags, version, specs, meta) + loc += _mako_interface_loader_api(dstpath, "level_zero", "hpp", namespace, tags, version, specs, meta) + print("Generated %s lines of code.\n"%loc) """ @@ -465,7 +516,7 @@ def generate_level_zero_queue_api(path, section, namespace, tags, version, specs name = "queue_api" filename = "queue_api.cpp" - layer_dstpath = os.path.join(path, "adapters/level_zero") + layer_dstpath = os.path.join(path, "adapters", "level_zero", "v2") os.makedirs(layer_dstpath, exist_ok=True) fout = os.path.join(layer_dstpath, filename) diff --git a/scripts/parse_specs.py b/scripts/parse_specs.py index 345066357a..fe5cbe2027 100644 --- a/scripts/parse_specs.py +++ b/scripts/parse_specs.py @@ -21,8 +21,8 @@ from version import Version -default_version = Version("0.10") -all_versions = [Version(ver) for ver in ["0.6", "0.7", "0.8", "0.9", "0.10"]] +default_version = Version("0.11") +all_versions = [Version(ver) for ver in ["0.6", "0.7", "0.8", "0.9", "0.10", "0.11"]] """ preprocess object diff --git a/scripts/templates/api.h.mako b/scripts/templates/api.h.mako index 9fc9944b47..85b8a78c2a 100644 --- a/scripts/templates/api.h.mako +++ b/scripts/templates/api.h.mako @@ -42,7 +42,7 @@ extern "C" { %if len(spec['objects']): // ${th.subt(n, tags, spec['header']['desc'])} #if !defined(__GNUC__) -#pragma region ${spec['name']} +#pragma region ${spec['name'].replace(' ', '_')} #endif %endif %for obj in spec['objects']: diff --git a/scripts/templates/api_funcs.def.mako b/scripts/templates/api_funcs.def.mako new file mode 100644 index 0000000000..f0fb653208 --- /dev/null +++ b/scripts/templates/api_funcs.def.mako @@ -0,0 +1,35 @@ +<%! 
+import re +from templates import helper as th +%><% + n=namespace + N=n.upper() + + x=tags['$x'] + X=x.upper() +%> +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file ${name}.def + * @version v${ver}-r${rev} + * + */ + + // Auto-generated file, do not edit. + +%for tbl in th.get_pfntables(specs, meta, n, tags): +%for obj in tbl['functions']: +_UR_API(${th.make_func_name(n, tags, obj)}) +%endfor +%endfor +%for obj in th.get_loader_functions(specs, meta, n, tags): +%if n + "Loader" in obj: +_UR_API(${obj}) +%endif +%endfor diff --git a/scripts/templates/ldrddi.cpp.mako b/scripts/templates/ldrddi.cpp.mako index 44631cc360..9c797a0ec3 100644 --- a/scripts/templates/ldrddi.cpp.mako +++ b/scripts/templates/ldrddi.cpp.mako @@ -365,6 +365,10 @@ ${tbl['export']['name']}( // Load the device-platform DDI tables for( auto& platform : ur_loader::getContext()->platforms ) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) + continue; + if(platform.initStatus != ${X}_RESULT_SUCCESS) continue; auto getTable = reinterpret_cast<${tbl['pfn']}>( diff --git a/scripts/templates/print.hpp.mako b/scripts/templates/print.hpp.mako index 9bf427b889..4180231ea4 100644 --- a/scripts/templates/print.hpp.mako +++ b/scripts/templates/print.hpp.mako @@ -411,6 +411,11 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct %endfor %endfor +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const ur_bool_t value) { + os << (value ? 
"true" : "false"); + return os; +} + namespace ${x}::details { /////////////////////////////////////////////////////////////////////////////// // @brief Print pointer value diff --git a/scripts/templates/queue_api.cpp.mako b/scripts/templates/queue_api.cpp.mako index f941c7ba03..fcfa89d258 100644 --- a/scripts/templates/queue_api.cpp.mako +++ b/scripts/templates/queue_api.cpp.mako @@ -24,8 +24,9 @@ from templates import helper as th ur_queue_handle_t_::~ur_queue_handle_t_() {} ## FUNCTION ################################################################### +namespace ${x}::level_zero { %for obj in th.get_queue_related_functions(specs, n, tags): -${X}_APIEXPORT ${x}_result_t ${X}_APICALL +${x}_result_t ${th.make_func_name(n, tags, obj)}( %for line in th.make_param_lines(n, tags, obj, format=["name", "type", "delim"]): ${line} @@ -35,3 +36,4 @@ ${th.make_func_name(n, tags, obj)}( return ${obj['params'][0]['name']}->${th.transform_queue_related_function_name(n, tags, obj, format=["name"])}; } %endfor +} \ No newline at end of file diff --git a/scripts/templates/stype_map_helpers.hpp.mako b/scripts/templates/stype_map_helpers.hpp.mako index 26aff00cd5..a62691c2d0 100644 --- a/scripts/templates/stype_map_helpers.hpp.mako +++ b/scripts/templates/stype_map_helpers.hpp.mako @@ -1,4 +1,5 @@ <%! 
+import os import re from templates import helper as th %><% @@ -7,7 +8,7 @@ from templates import helper as th x=tags['$x'] X=x.upper() %> -// This file is autogenerated from the template at ${self.template.filename} +// This file is autogenerated from the template at ${os.path.dirname(self.template.filename)}/${os.path.basename(self.template.filename)} %for obj in th.extract_objs(specs, r"enum"): %if obj["name"] == '$x_structure_type_t': diff --git a/scripts/templates/trcddi.cpp.mako b/scripts/templates/trcddi.cpp.mako index 671179d3e2..9c676810b4 100644 --- a/scripts/templates/trcddi.cpp.mako +++ b/scripts/templates/trcddi.cpp.mako @@ -45,15 +45,18 @@ namespace ur_tracing_layer ${th.make_pfncb_param_type(n, tags, obj)} params = { &${",&".join(th.make_param_lines(n, tags, obj, format=["name"]))} }; uint64_t instance = getContext()->notify_begin(${th.make_func_etor(n, tags, obj)}, "${th.make_func_name(n, tags, obj)}", ¶ms); - getContext()->logger.info("---> ${th.make_func_name(n, tags, obj)}"); + auto &logger = getContext()->logger; + logger.info(" ---> ${th.make_func_name(n, tags, obj)}\n"); ${x}_result_t result = ${th.make_pfn_name(n, tags, obj)}( ${", ".join(th.make_param_lines(n, tags, obj, format=["name"]))} ); getContext()->notify_end(${th.make_func_etor(n, tags, obj)}, "${th.make_func_name(n, tags, obj)}", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, ${th.make_func_etor(n, tags, obj)}, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, ${th.make_func_etor(n, tags, obj)}, ¶ms); + logger.info(" <--- ${th.make_func_name(n, tags, obj)}({}) -> {};\n", args_str.str(), result); + } return result; } diff --git a/scripts/templates/ur_interface_loader.cpp.mako b/scripts/templates/ur_interface_loader.cpp.mako new file mode 100644 index 0000000000..3298b5bcae --- 
/dev/null +++ b/scripts/templates/ur_interface_loader.cpp.mako @@ -0,0 +1,88 @@ +<%! +import re +from templates import helper as th +%><% + n=namespace + N=n.upper() + + x=tags['$x'] + X=x.upper() + Adapter=adapter.upper() +%>//===--------- ${n}_interface_loader.cpp - Level Zero Adapter ------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include <${n}_api.h> +#include <${n}_ddi.h> + +#include "ur_interface_loader.hpp" + +static ur_result_t validateProcInputs(ur_api_version_t version, void *pDdiTable) { + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + // Pre 1.0 we enforce loader and adapter must have same version. + // Post 1.0 only major version match should be required. + if (version != UR_API_VERSION_CURRENT) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + return UR_RESULT_SUCCESS; +} + +#ifdef UR_STATIC_ADAPTER_${Adapter} +namespace ${n}::${adapter} { +#elif defined(__cplusplus) +extern "C" { +#endif + +%for tbl in th.get_pfntables(specs, meta, n, tags): +${X}_APIEXPORT ${x}_result_t ${X}_APICALL ${tbl['export']['name']}( + %for line in th.make_param_lines(n, tags, tbl['export'], format=["type", "name", "delim"]): + ${line} + %endfor + ) +{ + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + + %for obj in tbl['functions']: + pDdiTable->${th.append_ws(th.make_pfn_name(n, tags, obj), 43)} = ${n}::${adapter}::${th.make_func_name(n, tags, obj)}; + %endfor + + return result; +} + +%endfor + +#ifdef UR_STATIC_ADAPTER_${Adapter} +} // namespace ur::${adapter} +#elif defined(__cplusplus) +} // extern "C" +#endif + +#ifdef UR_STATIC_ADAPTER_${Adapter} +namespace ur::${adapter} { +ur_result_t 
urAdapterGetDdiTables(ur_dditable_t *ddi) { + if (ddi == nullptr) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + ur_result_t result; + +%for tbl in th.get_pfntables(specs, meta, n, tags): + result = ${n}::${adapter}::${tbl['export']['name']}( ${X}_API_VERSION_CURRENT, &ddi->${tbl['name']} ); + if (result != UR_RESULT_SUCCESS) + return result; +%endfor + + return result; +} +} +#endif diff --git a/scripts/templates/ur_interface_loader.hpp.mako b/scripts/templates/ur_interface_loader.hpp.mako new file mode 100644 index 0000000000..e2902f93c8 --- /dev/null +++ b/scripts/templates/ur_interface_loader.hpp.mako @@ -0,0 +1,38 @@ +<%! +import re +from templates import helper as th +%><% + n=namespace + N=n.upper() + + x=tags['$x'] + X=x.upper() + Adapter=adapter.upper() +%>//===--------- ${n}_interface_loader.hpp - Level Zero Adapter ------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include <${n}_api.h> +#include <${n}_ddi.h> + +namespace ${n}::${adapter} { +%for s in specs: +%for obj in th.filter_items(s['objects'], 'type', 'function'): +%if not th.obj_traits.is_loader_only(obj): +${x}_result_t ${th.make_func_name(n, tags, obj)}( + %for line in th.make_param_lines(n, tags, obj, format=["type", "name", "delim"]): + ${line} + %endfor + ); +%endif +%endfor +%endfor +#ifdef UR_STATIC_ADAPTER_LEVEL_ZERO +ur_result_t urAdapterGetDdiTables(ur_dditable_t *ddi); +#endif +} diff --git a/scripts/templates/valddi.cpp.mako b/scripts/templates/valddi.cpp.mako index 778595b052..8cc4a9dc0f 100644 --- a/scripts/templates/valddi.cpp.mako +++ b/scripts/templates/valddi.cpp.mako @@ -57,8 +57,16 @@ namespace ur_validation_layer { %for key, values in sorted_param_checks: %for val in values: - if( ${val} ) + %if 'boundsError' in val: + if ( getContext()->enableBoundsChecking ) { + if ( ${val} ) { + return ${key}; + } + } + %else: + if ( ${val} ) return ${key}; + %endif %endfor %endfor @@ -178,9 +186,13 @@ namespace ur_validation_layer if (enabledLayerNames.count(nameFullValidation)) { enableParameterValidation = true; + enableBoundsChecking = true; enableLeakChecking = true; enableLifetimeValidation = true; } else { + if (enabledLayerNames.count(nameBoundsChecking)) { + enableBoundsChecking = true; + } if (enabledLayerNames.count(nameParameterValidation)) { enableParameterValidation = true; } @@ -209,13 +221,11 @@ namespace ur_validation_layer } ${x}_result_t context_t::tearDown() { - ${x}_result_t result = ${X}_RESULT_SUCCESS; - if (enableLeakChecking) { getContext()->refCountContext->logInvalidReferences(); - getContext()->refCountContext->clear(); } - return result; + + return ${X}_RESULT_SUCCESS; } } // namespace ur_validation_layer diff --git a/source/adapters/CMakeLists.txt 
b/source/adapters/CMakeLists.txt index e45f39fca8..f981c17dd5 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -46,7 +46,7 @@ function(add_ur_adapter_subdirectory name) endif() endfunction() -if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_ALL) +if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_L0_V2 OR UR_BUILD_ADAPTER_ALL) add_ur_adapter_subdirectory(level_zero) endif() diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt index 77bebd1409..3d0418fd07 100644 --- a/source/adapters/cuda/CMakeLists.txt +++ b/source/adapters/cuda/CMakeLists.txt @@ -46,6 +46,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.hpp ) +install_ur_library(${TARGET_NAME}) set_target_properties(${TARGET_NAME} PROPERTIES VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" @@ -99,8 +100,9 @@ if (UR_ENABLE_TRACING) endif() target_compile_definitions(${TARGET_NAME} PRIVATE XPTI_ENABLE_INSTRUMENTATION + XPTI_STATIC_LIBRARY ) - target_include_directories(${TARGET_NAME} PUBLIC + target_include_directories(${TARGET_NAME} PRIVATE ${XPTI_INCLUDES} ${CUDA_CUPTI_INCLUDE_DIR} ) @@ -115,6 +117,7 @@ endif() target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common + ${PROJECT_NAME}::umf Threads::Threads cudadrv ${EXTRA_LIBS} diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 8d21a93c75..e47bcf9c2a 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -26,6 +26,14 @@ commandBufferReleaseInternal(ur_exp_command_buffer_handle_t CommandBuffer) { return UR_RESULT_SUCCESS; } + // Release the memory allocated to the CudaGraph + UR_CHECK_ERROR(cuGraphDestroy(CommandBuffer->CudaGraph)); + + // Release the memory allocated to the CudaGraphExec + if (CommandBuffer->CudaGraphExec) { + 
UR_CHECK_ERROR(cuGraphExecDestroy(CommandBuffer->CudaGraphExec)); + } + delete CommandBuffer; return UR_RESULT_SUCCESS; } @@ -39,6 +47,17 @@ commandHandleReleaseInternal(ur_exp_command_buffer_command_handle_t Command) { // Decrement parent command-buffer internal ref count commandBufferReleaseInternal(Command->CommandBuffer); + // We create the ur_event_t returned to the user for a signal node using + // `makeWithNative` which sets `HasOwnership` to false. Therefore destruction + // of the `ur_event_t` object doesn't free the underlying CuEvent_t object and + // we need to do it manually ourselves. + if (Command->SignalNode) { + CUevent SignalEvent; + UR_CHECK_ERROR( + cuGraphEventRecordNodeGetEvent(Command->SignalNode, &SignalEvent)); + UR_CHECK_ERROR(cuEventDestroy(SignalEvent)); + } + delete Command; return UR_RESULT_SUCCESS; } @@ -61,27 +80,45 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { // Release the device UR_TRACE(urDeviceRelease(Device)); +} - // Release the memory allocated to the CudaGraph - cuGraphDestroy(CudaGraph); +std::unique_ptr +ur_exp_command_buffer_handle_t_::addSignalNode(CUgraphNode DepNode, + CUgraphNode &SignalNode) { + CUevent Event; + UR_CHECK_ERROR(cuEventCreate(&Event, CU_EVENT_DEFAULT)); + UR_CHECK_ERROR( + cuGraphAddEventRecordNode(&SignalNode, CudaGraph, &DepNode, 1, Event)); - // Release the memory allocated to the CudaGraphExec - if (CudaGraphExec) { - cuGraphExecDestroy(CudaGraphExec); - } + return std::unique_ptr( + ur_event_handle_t_::makeWithNative(Context, Event)); } -ur_exp_command_buffer_command_handle_t_:: - ur_exp_command_buffer_command_handle_t_( - ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, - std::shared_ptr &&Node, CUDA_KERNEL_NODE_PARAMS Params, - uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, - const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr) - : CommandBuffer(CommandBuffer), Kernel(Kernel), Node{std::move(Node)}, - Params(Params), 
WorkDim(WorkDim), RefCountInternal(1), - RefCountExternal(1) { - CommandBuffer->incrementInternalReferenceCount(); +ur_result_t ur_exp_command_buffer_handle_t_::addWaitNodes( + std::vector &DepsList, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList) { + std::vector WaitNodes(NumEventsInWaitList); + for (uint32_t i = 0; i < NumEventsInWaitList; i++) { + CUevent Event = EventWaitList[i]->get(); + UR_CHECK_ERROR(cuGraphAddEventWaitNode( + &WaitNodes[i], CudaGraph, DepsList.data(), DepsList.size(), Event)); + } + // Set DepsLists as an output parameter for communicating the list of wait + // nodes created. + DepsList = WaitNodes; + return UR_RESULT_SUCCESS; +} +kernel_command_handle::kernel_command_handle( + ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, + CUgraphNode Node, CUDA_KERNEL_NODE_PARAMS Params, uint32_t WorkDim, + const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, + const size_t *LocalWorkSizePtr, uint32_t NumKernelAlternatives, + ur_kernel_handle_t *KernelAlternatives, CUgraphNode SignalNode, + std::vector WaitNodes) + : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, + WaitNodes), + Kernel(Kernel), Params(Params), WorkDim(WorkDim) { const size_t CopySize = sizeof(size_t) * WorkDim; std::memcpy(GlobalWorkOffset, GlobalWorkOffsetPtr, CopySize); std::memcpy(GlobalWorkSize, GlobalWorkSizePtr, CopySize); @@ -97,6 +134,22 @@ ur_exp_command_buffer_command_handle_t_:: std::memset(GlobalWorkOffset + WorkDim, 0, ZeroSize); std::memset(GlobalWorkSize + WorkDim, 0, ZeroSize); } + + /* Add the default Kernel as a valid kernel handle for this command */ + ValidKernelHandles.insert(Kernel); + if (KernelAlternatives) { + ValidKernelHandles.insert(KernelAlternatives, + KernelAlternatives + NumKernelAlternatives); + } +}; + +ur_exp_command_buffer_command_handle_t_:: + ur_exp_command_buffer_command_handle_t_( + ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node, + 
CUgraphNode SignalNode, std::vector WaitNodes) + : CommandBuffer(CommandBuffer), Node(Node), SignalNode(SignalNode), + WaitNodes(WaitNodes), RefCountInternal(1), RefCountExternal(1) { + CommandBuffer->incrementInternalReferenceCount(); } /// Helper function for finding the Cuda Nodes associated with the @@ -124,7 +177,7 @@ static ur_result_t getNodesFromSyncPoints( for (size_t i = 0; i < NumSyncPointsInWaitList; i++) { if (auto NodeHandle = SyncPoints.find(SyncPointWaitList[i]); NodeHandle != SyncPoints.end()) { - CuNodesList.push_back(*NodeHandle->second.get()); + CuNodesList.push_back(NodeHandle->second); } else { return UR_RESULT_ERROR_INVALID_VALUE; } @@ -155,28 +208,38 @@ static void setCopyParams(const void *SrcPtr, const CUmemorytype_enum SrcType, Params.Depth = 1; } -// Helper function for enqueuing memory fills +// Helper function for enqueuing memory fills. Templated on the CommandType +// enum class for the type of fill being created. +template static ur_result_t enqueueCommandBufferFillHelper( ur_exp_command_buffer_handle_t CommandBuffer, void *DstDevice, const CUmemorytype_enum DstType, const void *Pattern, size_t PatternSize, size_t Size, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_exp_command_buffer_sync_point_t *RetSyncPoint, + ur_event_handle_t *RetEvent, + ur_exp_command_buffer_command_handle_t *RetCommand) { std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, DepsList)); + + if (NumEventsInWaitList) { + UR_CHECK_ERROR(CommandBuffer->addWaitNodes(DepsList, NumEventsInWaitList, + EventWaitList)); + } try { + // Graph node added to graph, if 
multiple nodes are created this will + // be set to the leaf node + CUgraphNode GraphNode; + const size_t N = Size / PatternSize; auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE ? *static_cast(DstDevice) : (CUdeviceptr)DstDevice; if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) { - // Create a new node - CUgraphNode GraphNode; CUDA_MEMSET_NODE_PARAMS NodeParams = {}; NodeParams.dst = DstPtr; NodeParams.elementSize = PatternSize; @@ -207,11 +270,6 @@ static ur_result_t enqueueCommandBufferFillHelper( cuGraphAddMemsetNode(&GraphNode, CommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParams, CommandBuffer->Device->getNativeContext())); - - // Get sync point and register the cuNode with it. - *SyncPoint = - CommandBuffer->addSyncPoint(std::make_shared(GraphNode)); - } else { // CUDA has no memset functions that allow setting values more than 4 // bytes. UR API lets you pass an arbitrary "pattern" to the buffer @@ -222,10 +280,6 @@ static ur_result_t enqueueCommandBufferFillHelper( size_t NumberOfSteps = PatternSize / sizeof(uint8_t); - // Shared pointer that will point to the last node created - std::shared_ptr GraphNodePtr; - // Create a new node - CUgraphNode GraphNodeFirst; // Update NodeParam CUDA_MEMSET_NODE_PARAMS NodeParamsStepFirst = {}; NodeParamsStepFirst.dst = DstPtr; @@ -236,16 +290,12 @@ static ur_result_t enqueueCommandBufferFillHelper( NodeParamsStepFirst.width = 1; UR_CHECK_ERROR(cuGraphAddMemsetNode( - &GraphNodeFirst, CommandBuffer->CudaGraph, DepsList.data(), + &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParamsStepFirst, CommandBuffer->Device->getNativeContext())); - // Get sync point and register the cuNode with it. - *SyncPoint = CommandBuffer->addSyncPoint( - std::make_shared(GraphNodeFirst)); - DepsList.clear(); - DepsList.push_back(GraphNodeFirst); + DepsList.push_back(GraphNode); // we walk up the pattern in 1-byte steps, and call cuMemset for each // 1-byte chunk of the pattern. 
@@ -256,8 +306,6 @@ static ur_result_t enqueueCommandBufferFillHelper( // offset the pointer to the part of the buffer we want to write to auto OffsetPtr = DstPtr + (Step * sizeof(uint8_t)); - // Create a new node - CUgraphNode GraphNode; // Update NodeParam CUDA_MEMSET_NODE_PARAMS NodeParamsStep = {}; NodeParamsStep.dst = (CUdeviceptr)OffsetPtr; @@ -272,18 +320,37 @@ static ur_result_t enqueueCommandBufferFillHelper( DepsList.size(), &NodeParamsStep, CommandBuffer->Device->getNativeContext())); - GraphNodePtr = std::make_shared(GraphNode); - // Get sync point and register the cuNode with it. - *SyncPoint = CommandBuffer->addSyncPoint(GraphNodePtr); - DepsList.clear(); - DepsList.push_back(*GraphNodePtr.get()); + DepsList.push_back(GraphNode); } } + + CUgraphNode SignalNode = nullptr; + if (RetEvent) { + auto SignalEvent = CommandBuffer->addSignalNode(GraphNode, SignalNode); + *RetEvent = SignalEvent.release(); + } + + // Get sync point and register the cuNode with it. + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = CommandBuffer->addSyncPoint(SyncPointNode); + if (RetSyncPoint) { + *RetSyncPoint = SyncPoint; + } + + std::vector WaitNodes = + NumEventsInWaitList ? 
DepsList : std::vector(); + auto NewCommand = new T(CommandBuffer, GraphNode, SignalNode, WaitNodes); + CommandBuffer->CommandHandles.push_back(NewCommand); + + if (RetCommand) { + NewCommand->incrementInternalReferenceCount(); + *RetCommand = NewCommand; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( @@ -358,60 +425,85 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numKernelAlternatives, ur_kernel_handle_t *phKernelAlternatives, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, ur_exp_command_buffer_command_handle_t *phCommand) { // Preconditions + // Command handles can only be obtained from updatable command-buffers + UR_ASSERT(!(phCommand && !hCommandBuffer->IsUpdatable), + UR_RESULT_ERROR_INVALID_OPERATION); UR_ASSERT(hCommandBuffer->Context == hKernel->getContext(), UR_RESULT_ERROR_INVALID_KERNEL); UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - ur_result_t Result = UR_RESULT_SUCCESS; - CUgraphNode GraphNode; + for (uint32_t i = 0; i < numKernelAlternatives; ++i) { + UR_ASSERT(phKernelAlternatives[i] != hKernel, + UR_RESULT_ERROR_INVALID_VALUE); + } - std::vector DepsList; + try { + CUgraphNode GraphNode; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); + std::vector DepsList; + UR_CHECK_ERROR(getNodesFromSyncPoints( + 
hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList)); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + if (numEventsInWaitList) { + UR_CHECK_ERROR(hCommandBuffer->addWaitNodes(DepsList, numEventsInWaitList, + phEventWaitList)); + } - if (*pGlobalWorkSize == 0) { - try { + if (*pGlobalWorkSize == 0) { // Create an empty node if the kernel workload size is zero - UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph, - DepsList.data(), DepsList.size())); + if (!phEvent) { + UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, + hCommandBuffer->CudaGraph, + DepsList.data(), DepsList.size())); + } else { + CUevent Event = nullptr; + UR_CHECK_ERROR(cuEventCreate(&Event, CU_EVENT_DEFAULT)); + UR_CHECK_ERROR( + cuGraphAddEventRecordNode(&GraphNode, hCommandBuffer->CudaGraph, + DepsList.data(), DepsList.size(), Event)); + + auto RetEventUP = std::unique_ptr( + ur_event_handle_t_::makeWithNative(hCommandBuffer->Context, Event)); + + *phEvent = RetEventUP.release(); + } + + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } // Get sync point and register the cuNode with it. - *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); - } catch (ur_result_t Err) { - Result = Err; + CUgraphNode SyncPointNode = SignalNode ? 
SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } + return UR_RESULT_SUCCESS; } - return Result; - } - // Set the number of threads per block to the number of threads per warp - // by default unless user has provided a better number - size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; - size_t BlocksPerGrid[3] = {1u, 1u, 1u}; + // Set the number of threads per block to the number of threads per warp + // by default unless user has provided a better number + size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; + size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - uint32_t LocalSize = hKernel->getLocalSize(); - CUfunction CuFunc = hKernel->get(); - Result = - setKernelParams(hCommandBuffer->Context, hCommandBuffer->Device, workDim, - pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + uint32_t LocalSize = hKernel->getLocalSize(); + CUfunction CuFunc = hKernel->get(); + UR_CHECK_ERROR(setKernelParams( + hCommandBuffer->Context, hCommandBuffer->Device, workDim, + pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, hKernel, CuFunc, + ThreadsPerBlock, BlocksPerGrid)); - try { // Set node param structure with the kernel related data auto &ArgIndices = hKernel->getArgIndices(); CUDA_KERNEL_NODE_PARAMS NodeParams = {}; @@ -433,43 +525,53 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( if (LocalSize != 0) hKernel->clearLocalSize(); + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } + // Get sync point and register the cuNode with it. - auto NodeSP = std::make_shared(GraphNode); + CUgraphNode SyncPointNode = SignalNode ? 
SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); if (pSyncPoint) { - *pSyncPoint = hCommandBuffer->addSyncPoint(NodeSP); + *pSyncPoint = SyncPoint; } - auto NewCommand = new ur_exp_command_buffer_command_handle_t_{ - hCommandBuffer, hKernel, std::move(NodeSP), NodeParams, - workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize}; - - NewCommand->incrementInternalReferenceCount(); + std::vector WaitNodes = + numEventsInWaitList ? DepsList : std::vector(); + auto NewCommand = new kernel_command_handle( + hCommandBuffer, hKernel, GraphNode, NodeParams, workDim, + pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numKernelAlternatives, phKernelAlternatives, SignalNode, WaitNodes); hCommandBuffer->CommandHandles.push_back(NewCommand); if (phCommand) { + NewCommand->incrementInternalReferenceCount(); *phCommand = NewCommand; } - } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc, size_t size, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); - if (Result != UR_RESULT_SUCCESS) { - return Result; + if (numEventsInWaitList) { + UR_CHECK_ERROR(hCommandBuffer->addWaitNodes(DepsList, numEventsInWaitList, + 
phEventWaitList)); } try { @@ -481,13 +583,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParams, hCommandBuffer->Device->getNativeContext())); + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } + // Get sync point and register the cuNode with it. - *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } + + std::vector WaitNodes = + numEventsInWaitList ? DepsList : std::vector(); + auto NewCommand = new usm_memcpy_command_handle(hCommandBuffer, GraphNode, + SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); + + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( @@ -495,8 +618,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( ur_mem_handle_t hDstMem, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { CUgraphNode GraphNode; std::vector DepsList; @@ -505,12 +629,12 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urCommandBufferAppendMemBufferCopyExp( UR_ASSERT(size + srcOffset <= std::get(hSrcMem->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); - if (Result != UR_RESULT_SUCCESS) { - return Result; + if (numEventsInWaitList) { + UR_CHECK_ERROR(hCommandBuffer->addWaitNodes(DepsList, numEventsInWaitList, + phEventWaitList)); } try { @@ -527,13 +651,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParams, hCommandBuffer->Device->getNativeContext())); + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } + // Get sync point and register the cuNode with it. - *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } + + std::vector WaitNodes = + numEventsInWaitList ? 
DepsList : std::vector(); + auto NewCommand = new buffer_copy_command_handle(hCommandBuffer, GraphNode, + SignalNode, WaitNodes); + + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( @@ -543,16 +687,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); - if (Result != UR_RESULT_SUCCESS) { - return Result; + if (numEventsInWaitList) { + UR_CHECK_ERROR(hCommandBuffer->addWaitNodes(DepsList, numEventsInWaitList, + phEventWaitList)); } try { @@ -570,13 +715,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParams, hCommandBuffer->Device->getNativeContext())); + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } + // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } + + std::vector WaitNodes = + numEventsInWaitList ? DepsList : std::vector(); + auto NewCommand = new buffer_copy_rect_command_handle( + hCommandBuffer, GraphNode, SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); + + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT @@ -585,16 +751,17 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( size_t offset, size_t size, const void *pSrc, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); - if (Result != UR_RESULT_SUCCESS) { - return Result; + if (numEventsInWaitList) { + UR_CHECK_ERROR(hCommandBuffer->addWaitNodes(DepsList, numEventsInWaitList, + phEventWaitList)); } try { @@ -609,13 +776,34 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParams, hCommandBuffer->Device->getNativeContext())); + // Add signal node if external return event is used. 
+ CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } + // Get sync point and register the cuNode with it. - *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } + + std::vector WaitNodes = + numEventsInWaitList ? DepsList : std::vector(); + auto NewCommand = new buffer_write_command_handle(hCommandBuffer, GraphNode, + SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); + + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT @@ -623,16 +811,17 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, size_t offset, size_t size, void *pDst, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); - if (Result != UR_RESULT_SUCCESS) { - return Result; + if (numEventsInWaitList) { + UR_CHECK_ERROR(hCommandBuffer->addWaitNodes(DepsList, numEventsInWaitList, + phEventWaitList)); } try { 
@@ -647,13 +836,34 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParams, hCommandBuffer->Device->getNativeContext())); + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } + // Get sync point and register the cuNode with it. - *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } + + std::vector WaitNodes = + numEventsInWaitList ? DepsList : std::vector(); + auto NewCommand = new buffer_read_command_handle(hCommandBuffer, GraphNode, + SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); + + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT @@ -664,16 +874,17 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + 
pSyncPointWaitList, DepsList)); - if (Result != UR_RESULT_SUCCESS) { - return Result; + if (numEventsInWaitList) { + UR_CHECK_ERROR(hCommandBuffer->addWaitNodes(DepsList, numEventsInWaitList, + phEventWaitList)); } try { @@ -690,13 +901,34 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParams, hCommandBuffer->Device->getNativeContext())); + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } + // Get sync point and register the cuNode with it. - *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } + + std::vector WaitNodes = + numEventsInWaitList ? 
DepsList : std::vector(); + auto NewCommand = new buffer_write_rect_command_handle( + hCommandBuffer, GraphNode, SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); + + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT @@ -707,16 +939,17 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( size_t hostRowPitch, size_t hostSlicePitch, void *pDst, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - ur_result_t Result = UR_RESULT_SUCCESS; + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); - if (Result != UR_RESULT_SUCCESS) { - return Result; + if (numEventsInWaitList) { + UR_CHECK_ERROR(hCommandBuffer->addWaitNodes(DepsList, numEventsInWaitList, + phEventWaitList)); } try { @@ -733,13 +966,34 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParams, hCommandBuffer->Device->getNativeContext())); + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } + // Get sync point and register the cuNode with it. 
- *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } + + std::vector WaitNodes = + numEventsInWaitList ? DepsList : std::vector(); + auto NewCommand = new buffer_read_rect_command_handle( + hCommandBuffer, GraphNode, SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); + + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( @@ -747,34 +1001,55 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( size_t /*Size*/, ur_usm_migration_flags_t /*Flags*/, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { // Prefetch cmd is not supported by Cuda Graph. // We implement it as an empty node to enforce dependencies. - ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); + + if (numEventsInWaitList) { + UR_CHECK_ERROR(hCommandBuffer->addWaitNodes(DepsList, numEventsInWaitList, + phEventWaitList)); + } try { // Add an empty node to preserve dependencies. 
UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size())); + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } + // Get sync point and register the cuNode with it. - *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } - setErrorMessage("Prefetch hint ignored and replaced with empty node as " - "prefetch is not supported by CUDA Graph backend", - UR_RESULT_SUCCESS); - Result = UR_RESULT_ERROR_ADAPTER_SPECIFIC; + std::vector WaitNodes = + numEventsInWaitList ? DepsList : std::vector(); + auto NewCommand = new usm_prefetch_command_handle(hCommandBuffer, GraphNode, + SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); + + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( @@ -782,35 +1057,56 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( size_t /*Size*/, ur_usm_advice_flags_t /*Advice*/, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { // Mem-Advise cmd is not supported by Cuda Graph. // We implement it as an empty node to enforce dependencies. 
- ur_result_t Result = UR_RESULT_SUCCESS; CUgraphNode GraphNode; std::vector DepsList; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); + + if (numEventsInWaitList) { + UR_CHECK_ERROR(hCommandBuffer->addWaitNodes(DepsList, numEventsInWaitList, + phEventWaitList)); + } try { // Add an empty node to preserve dependencies. UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size())); + // Add signal node if external return event is used. + CUgraphNode SignalNode = nullptr; + if (phEvent) { + auto SignalEvent = hCommandBuffer->addSignalNode(GraphNode, SignalNode); + *phEvent = SignalEvent.release(); + } + // Get sync point and register the cuNode with it. - *pSyncPoint = - hCommandBuffer->addSyncPoint(std::make_shared(GraphNode)); + CUgraphNode SyncPointNode = SignalNode ? SignalNode : GraphNode; + auto SyncPoint = hCommandBuffer->addSyncPoint(SyncPointNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } - setErrorMessage("Memory advice ignored and replaced with empty node as " - "memory advice is not supported by CUDA Graph backend", - UR_RESULT_SUCCESS); - Result = UR_RESULT_ERROR_ADAPTER_SPECIFIC; + std::vector WaitNodes = + numEventsInWaitList ? 
DepsList : std::vector(); + auto NewCommand = new usm_advise_command_handle(hCommandBuffer, GraphNode, + SignalNode, WaitNodes); + hCommandBuffer->CommandHandles.push_back(NewCommand); + + if (phCommand) { + NewCommand->incrementInternalReferenceCount(); + *phCommand = NewCommand; + } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( @@ -818,7 +1114,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( const void *pPattern, size_t patternSize, size_t offset, size_t size, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { auto ArgsAreMultiplesOfPatternSize = (offset % patternSize == 0) || (size % patternSize == 0); @@ -833,9 +1131,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( auto DstDevice = std::get(hBuffer->Mem) .getPtrWithOffset(hCommandBuffer->Device, offset); - return enqueueCommandBufferFillHelper( + return enqueueCommandBufferFillHelper( hCommandBuffer, &DstDevice, CU_MEMORYTYPE_DEVICE, pPattern, patternSize, - size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + size, numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( @@ -843,24 +1142,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( const void *pPattern, size_t patternSize, size_t size, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - + uint32_t 
numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { auto PatternIsValid = (pPattern != nullptr); auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) && (patternSize > 0); // is a positive power of two UR_ASSERT(PatternIsValid && PatternSizeIsValid, UR_RESULT_ERROR_INVALID_SIZE); - return enqueueCommandBufferFillHelper( + return enqueueCommandBufferFillHelper( hCommandBuffer, pPtr, CU_MEMORYTYPE_UNIFIED, pPattern, patternSize, size, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; try { std::unique_ptr RetImplEvent{nullptr}; @@ -870,10 +1170,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( CUstream CuStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - if ((Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList)) != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr( @@ -890,10 +1188,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( *phEvent = RetImplEvent.release(); } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainCommandExp( @@ -909,51 +1207,53 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( return 
commandHandleReleaseInternal(hCommand); } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( - ur_exp_command_buffer_command_handle_t hCommand, - const ur_exp_command_buffer_update_kernel_launch_desc_t - *pUpdateKernelLaunch) { - // Update requires command-buffer to be finalized - ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer; - if (!CommandBuffer->CudaGraphExec) { +/** + * Validates contents of the update command description. + * @param[in] Command The command which is being updated. + * @param[in] UpdateCommandDesc The update command description. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +validateCommandDesc(kernel_command_handle *Command, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *UpdateCommandDesc) { + auto CommandBuffer = Command->CommandBuffer; + // Update requires the command-buffer to be finalized and updatable. + if (!CommandBuffer->CudaGraphExec || !CommandBuffer->IsUpdatable) { return UR_RESULT_ERROR_INVALID_OPERATION; } - // Update requires command-buffer to be created with update enabled - if (!CommandBuffer->IsUpdatable) { - return UR_RESULT_ERROR_INVALID_OPERATION; + if (UpdateCommandDesc->newWorkDim != Command->WorkDim && + (!UpdateCommandDesc->pNewGlobalWorkOffset || + !UpdateCommandDesc->pNewGlobalWorkSize)) { + return UR_RESULT_ERROR_INVALID_VALUE; } - if (auto NewWorkDim = pUpdateKernelLaunch->newWorkDim) { - // Error if work dim changes - if (NewWorkDim != hCommand->WorkDim) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } - - // Error If Local size and not global size - if ((pUpdateKernelLaunch->pNewLocalWorkSize != nullptr) && - (pUpdateKernelLaunch->pNewGlobalWorkSize == nullptr)) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } - - // Error if local size non-nullptr and created with null - // or if local size nullptr and created with non-null - const bool IsNewLocalSizeNull = - pUpdateKernelLaunch->pNewLocalWorkSize == nullptr; - const 
bool IsOriginalLocalSizeNull = hCommand->isNullLocalSize(); - - if (IsNewLocalSizeNull ^ IsOriginalLocalSizeNull) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } + if (UpdateCommandDesc->hNewKernel && + !Command->ValidKernelHandles.count(UpdateCommandDesc->hNewKernel)) { + return UR_RESULT_ERROR_INVALID_VALUE; } + return UR_RESULT_SUCCESS; +} + +/** + * Updates the arguments of CommandDesc->hNewKernel + * @param[in] Device The device associated with the kernel being updated. + * @param[in] UpdateCommandDesc The update command description that contains the + * new kernel and its arguments. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +updateKernelArguments(ur_device_handle_t Device, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *UpdateCommandDesc) { - // Kernel corresponding to the command to update - ur_kernel_handle_t Kernel = hCommand->Kernel; + ur_kernel_handle_t NewKernel = UpdateCommandDesc->hNewKernel; // Update pointer arguments to the kernel - uint32_t NumPointerArgs = pUpdateKernelLaunch->numNewPointerArgs; + uint32_t NumPointerArgs = UpdateCommandDesc->numNewPointerArgs; const ur_exp_command_buffer_update_pointer_arg_desc_t *ArgPointerList = - pUpdateKernelLaunch->pNewPointerArgList; + UpdateCommandDesc->pNewPointerArgList; for (uint32_t i = 0; i < NumPointerArgs; i++) { const auto &PointerArgDesc = ArgPointerList[i]; uint32_t ArgIndex = PointerArgDesc.argIndex; @@ -961,7 +1261,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( ur_result_t Result = UR_RESULT_SUCCESS; try { - Kernel->setKernelArg(ArgIndex, sizeof(ArgValue), ArgValue); + NewKernel->setKernelArg(ArgIndex, sizeof(ArgValue), ArgValue); } catch (ur_result_t Err) { Result = Err; return Result; @@ -969,9 +1269,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( } // Update memobj arguments to the kernel - uint32_t NumMemobjArgs = pUpdateKernelLaunch->numNewMemObjArgs; + uint32_t NumMemobjArgs = 
UpdateCommandDesc->numNewMemObjArgs; const ur_exp_command_buffer_update_memobj_arg_desc_t *ArgMemobjList = - pUpdateKernelLaunch->pNewMemObjArgList; + UpdateCommandDesc->pNewMemObjArgList; for (uint32_t i = 0; i < NumMemobjArgs; i++) { const auto &MemobjArgDesc = ArgMemobjList[i]; uint32_t ArgIndex = MemobjArgDesc.argIndex; @@ -980,11 +1280,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( ur_result_t Result = UR_RESULT_SUCCESS; try { if (ArgValue == nullptr) { - Kernel->setKernelArg(ArgIndex, 0, nullptr); + NewKernel->setKernelArg(ArgIndex, 0, nullptr); } else { - CUdeviceptr CuPtr = - std::get(ArgValue->Mem).getPtr(CommandBuffer->Device); - Kernel->setKernelArg(ArgIndex, sizeof(CUdeviceptr), (void *)&CuPtr); + CUdeviceptr CuPtr = std::get(ArgValue->Mem).getPtr(Device); + NewKernel->setKernelArg(ArgIndex, sizeof(CUdeviceptr), (void *)&CuPtr); } } catch (ur_result_t Err) { Result = Err; @@ -993,9 +1292,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( } // Update value arguments to the kernel - uint32_t NumValueArgs = pUpdateKernelLaunch->numNewValueArgs; + uint32_t NumValueArgs = UpdateCommandDesc->numNewValueArgs; const ur_exp_command_buffer_update_value_arg_desc_t *ArgValueList = - pUpdateKernelLaunch->pNewValueArgList; + UpdateCommandDesc->pNewValueArgList; for (uint32_t i = 0; i < NumValueArgs; i++) { const auto &ValueArgDesc = ArgValueList[i]; uint32_t ArgIndex = ValueArgDesc.argIndex; @@ -1003,59 +1302,93 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( const void *ArgValue = ValueArgDesc.pNewValueArg; ur_result_t Result = UR_RESULT_SUCCESS; - try { - Kernel->setKernelArg(ArgIndex, ArgSize, ArgValue); + NewKernel->setKernelArg(ArgIndex, ArgSize, ArgValue); } catch (ur_result_t Err) { Result = Err; return Result; } } - // Set the updated ND range - const uint32_t NewWorkDim = pUpdateKernelLaunch->newWorkDim; - if (NewWorkDim != 0) { - UR_ASSERT(NewWorkDim > 0, 
UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - UR_ASSERT(NewWorkDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - hCommand->WorkDim = NewWorkDim; + return UR_RESULT_SUCCESS; +} + +/** + * Updates the command buffer command with new values from the update + * description. + * @param[in] Command The command to be updated. + * @param[in] UpdateCommandDesc The update command description. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +updateCommand(kernel_command_handle *Command, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *UpdateCommandDesc) { + if (UpdateCommandDesc->hNewKernel) { + Command->Kernel = UpdateCommandDesc->hNewKernel; } - if (pUpdateKernelLaunch->pNewGlobalWorkOffset) { - hCommand->setGlobalOffset(pUpdateKernelLaunch->pNewGlobalWorkOffset); + if (UpdateCommandDesc->newWorkDim) { + Command->WorkDim = UpdateCommandDesc->newWorkDim; } - if (pUpdateKernelLaunch->pNewGlobalWorkSize) { - hCommand->setGlobalSize(pUpdateKernelLaunch->pNewGlobalWorkSize); + if (UpdateCommandDesc->pNewGlobalWorkOffset) { + Command->setGlobalOffset(UpdateCommandDesc->pNewGlobalWorkOffset); } - if (pUpdateKernelLaunch->pNewLocalWorkSize) { - hCommand->setLocalSize(pUpdateKernelLaunch->pNewLocalWorkSize); + if (UpdateCommandDesc->pNewGlobalWorkSize) { + Command->setGlobalSize(UpdateCommandDesc->pNewGlobalWorkSize); + if (!UpdateCommandDesc->pNewLocalWorkSize) { + Command->setNullLocalSize(); + } } - size_t *GlobalWorkOffset = hCommand->GlobalWorkOffset; - size_t *GlobalWorkSize = hCommand->GlobalWorkSize; + if (UpdateCommandDesc->pNewLocalWorkSize) { + Command->setLocalSize(UpdateCommandDesc->pNewLocalWorkSize); + } - // If no worksize is provided make sure we pass nullptr to setKernelParams so + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( + ur_exp_command_buffer_command_handle_t hCommand, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *pUpdateKernelLaunch) { + + 
ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer; + + if (hCommand->getCommandType() != CommandType::Kernel) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + + auto KernelCommandHandle = static_cast(hCommand); + + UR_CHECK_ERROR(validateCommandDesc(KernelCommandHandle, pUpdateKernelLaunch)); + UR_CHECK_ERROR( + updateKernelArguments(CommandBuffer->Device, pUpdateKernelLaunch)); + UR_CHECK_ERROR(updateCommand(KernelCommandHandle, pUpdateKernelLaunch)); + + // If no work-size is provided make sure we pass nullptr to setKernelParams so // it can guess the local work size. - const bool ProvidedLocalSize = !hCommand->isNullLocalSize(); - size_t *LocalWorkSize = ProvidedLocalSize ? hCommand->LocalWorkSize : nullptr; - uint32_t WorkDim = hCommand->WorkDim; + const bool ProvidedLocalSize = !KernelCommandHandle->isNullLocalSize(); + size_t *LocalWorkSize = + ProvidedLocalSize ? KernelCommandHandle->LocalWorkSize : nullptr; // Set the number of threads per block to the number of threads per warp - // by default unless user has provided a better number + // by default unless user has provided a better number. 
size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - CUfunction CuFunc = Kernel->get(); - ur_context_handle_t Context = CommandBuffer->Context; - ur_device_handle_t Device = CommandBuffer->Device; - auto Result = setKernelParams(Context, Device, WorkDim, GlobalWorkOffset, - GlobalWorkSize, LocalWorkSize, Kernel, CuFunc, - ThreadsPerBlock, BlocksPerGrid); + CUfunction CuFunc = KernelCommandHandle->Kernel->get(); + auto Result = setKernelParams( + CommandBuffer->Context, CommandBuffer->Device, + KernelCommandHandle->WorkDim, KernelCommandHandle->GlobalWorkOffset, + KernelCommandHandle->GlobalWorkSize, LocalWorkSize, + KernelCommandHandle->Kernel, CuFunc, ThreadsPerBlock, BlocksPerGrid); if (Result != UR_RESULT_SUCCESS) { return Result; } - CUDA_KERNEL_NODE_PARAMS &Params = hCommand->Params; + CUDA_KERNEL_NODE_PARAMS &Params = KernelCommandHandle->Params; Params.func = CuFunc; Params.gridDimX = BlocksPerGrid[0]; @@ -1064,15 +1397,82 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.blockDimX = ThreadsPerBlock[0]; Params.blockDimY = ThreadsPerBlock[1]; Params.blockDimZ = ThreadsPerBlock[2]; - Params.sharedMemBytes = Kernel->getLocalSize(); - Params.kernelParams = const_cast(Kernel->getArgIndices().data()); + Params.sharedMemBytes = KernelCommandHandle->Kernel->getLocalSize(); + Params.kernelParams = + const_cast(KernelCommandHandle->Kernel->getArgIndices().data()); - CUgraphNode Node = *(hCommand->Node); + CUgraphNode Node = KernelCommandHandle->Node; CUgraphExec CudaGraphExec = CommandBuffer->CudaGraphExec; UR_CHECK_ERROR(cuGraphExecKernelNodeSetParams(CudaGraphExec, Node, &Params)); return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateSignalEventExp( + ur_exp_command_buffer_command_handle_t hCommand, + ur_event_handle_t *phEvent) { + ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer; + + // Update requires command-buffer to be finalized + if 
(!CommandBuffer->CudaGraphExec) { + return UR_RESULT_ERROR_INVALID_OPERATION; + } + + // Update requires command-buffer to be created with update enabled + if (!CommandBuffer->IsUpdatable) { + return UR_RESULT_ERROR_INVALID_OPERATION; + } + + // Error to try to update the signal event, when a signal event wasn't set on + // creation + CUgraphNode SignalNode = hCommand->SignalNode; + if (phEvent != nullptr && SignalNode == nullptr) { + return UR_RESULT_ERROR_INVALID_OPERATION; + } + + CUevent SignalEvent; + UR_CHECK_ERROR(cuGraphEventRecordNodeGetEvent(SignalNode, &SignalEvent)); + + if (phEvent) { + *phEvent = std::unique_ptr( + ur_event_handle_t_::makeWithNative(CommandBuffer->Context, + SignalEvent)) + .release(); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateWaitEventsExp( + ur_exp_command_buffer_command_handle_t hCommand, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList) { + ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer; + + // Update requires command-buffer to be finalized + if (!CommandBuffer->CudaGraphExec) { + return UR_RESULT_ERROR_INVALID_OPERATION; + } + + // Update requires command-buffer to be created with update enabled + if (!CommandBuffer->IsUpdatable) { + return UR_RESULT_ERROR_INVALID_OPERATION; + } + + // Error if number of wait nodes is not the same as when node was created + std::vector &WaitNodes = hCommand->WaitNodes; + if (NumEventsInWaitList != WaitNodes.size()) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + CUgraphExec CudaGraphExec = CommandBuffer->CudaGraphExec; + for (uint32_t i = 0; i < NumEventsInWaitList; i++) { + ur_event_handle_t WaitEvent = phEventWaitList[i]; + UR_CHECK_ERROR(cuGraphExecEventWaitNodeSetEvent(CudaGraphExec, WaitNodes[i], + WaitEvent->get())); + } + + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp( ur_exp_command_buffer_handle_t hCommandBuffer, 
ur_exp_command_buffer_info_t propName, size_t propSize, void *pPropValue, @@ -1082,6 +1482,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp( switch (propName) { case UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT: return ReturnValue(hCommandBuffer->getExternalReferenceCount()); + case UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR: { + ur_exp_command_buffer_desc_t Descriptor{}; + Descriptor.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; + Descriptor.pNext = nullptr; + Descriptor.isUpdatable = hCommandBuffer->IsUpdatable; + Descriptor.isInOrder = false; + Descriptor.enableProfiling = false; + + return ReturnValue(Descriptor); + } default: assert(!"Command-buffer info request not implemented"); } diff --git a/source/adapters/cuda/command_buffer.hpp b/source/adapters/cuda/command_buffer.hpp index d83269f2ae..c82409104f 100644 --- a/source/adapters/cuda/command_buffer.hpp +++ b/source/adapters/cuda/command_buffer.hpp @@ -16,6 +16,7 @@ #include "logger/ur_logger.hpp" #include #include +#include // Trace an internal UR call #define UR_TRACE(Call) \ @@ -34,17 +35,78 @@ logger::always("UR <--- {}({})", #Call, Result); \ } -// Handle to a kernel command. -// -// Struct that stores all the information related to a kernel command in a -// command-buffer, such that the command can be recreated. When handles can -// be returned from other command types this struct will need refactored. +enum class CommandType { + Kernel, + USMMemcpy, + USMFill, + MemBufferCopy, + MemBufferCopyRect, + MemBufferRead, + MemBufferReadRect, + MemBufferWrite, + MemBufferWriteRect, + MemBufferFill, + USMPrefetch, + USMAdvise +}; + +// Command handle that can be returned from command append entry-points. +// Implemented as an abstract base class that handles for the specific +// command types derive from. 
struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_command_handle_t_( + ur_exp_command_buffer_handle_t CommandBuffer, CUgraphNode Node, + CUgraphNode SignalNode, std::vector WaitNodes); + + virtual ~ur_exp_command_buffer_command_handle_t_() {} + + virtual CommandType getCommandType() const noexcept = 0; + + uint32_t incrementInternalReferenceCount() noexcept { + return ++RefCountInternal; + } + uint32_t decrementInternalReferenceCount() noexcept { + return --RefCountInternal; + } + + uint32_t incrementExternalReferenceCount() noexcept { + return ++RefCountExternal; + } + uint32_t decrementExternalReferenceCount() noexcept { + return --RefCountExternal; + } + uint32_t getExternalReferenceCount() const noexcept { + return RefCountExternal; + } + + // Parent UR command-buffer. + ur_exp_command_buffer_handle_t CommandBuffer; + // Node created in graph for the command. + CUgraphNode Node; + // An optional EventRecordNode that's a successor of Node to signal + // dependent commands outwith the command-buffer. + CUgraphNode SignalNode; + // Optional list of EventWait Nodes to wait on commands from outside of the + // command-buffer. 
+ std::vector WaitNodes; + +private: + std::atomic_uint32_t RefCountInternal; + std::atomic_uint32_t RefCountExternal; +}; + +struct kernel_command_handle : ur_exp_command_buffer_command_handle_t_ { + kernel_command_handle( ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, - std::shared_ptr &&Node, CUDA_KERNEL_NODE_PARAMS Params, - uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, - const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr); + CUgraphNode Node, CUDA_KERNEL_NODE_PARAMS Params, uint32_t WorkDim, + const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, + const size_t *LocalWorkSizePtr, uint32_t NumKernelAlternatives, + ur_kernel_handle_t *KernelAlternatives, CUgraphNode SignalNode, + std::vector WaitNodes); + + CommandType getCommandType() const noexcept override { + return CommandType::Kernel; + } void setGlobalOffset(const size_t *GlobalWorkOffsetPtr) { const size_t CopySize = sizeof(size_t) * WorkDim; @@ -73,41 +135,151 @@ struct ur_exp_command_buffer_command_handle_t_ { } } + void setNullLocalSize() noexcept { + std::memset(LocalWorkSize, 0, sizeof(size_t) * 3); + } + bool isNullLocalSize() const noexcept { const size_t Zeros[3] = {0, 0, 0}; return 0 == std::memcmp(LocalWorkSize, Zeros, sizeof(LocalWorkSize)); } - uint32_t incrementInternalReferenceCount() noexcept { - return ++RefCountInternal; - } - uint32_t decrementInternalReferenceCount() noexcept { - return --RefCountInternal; - } + // The currently active kernel handle for this command. + ur_kernel_handle_t Kernel; - uint32_t incrementExternalReferenceCount() noexcept { - return ++RefCountExternal; - } - uint32_t decrementExternalReferenceCount() noexcept { - return --RefCountExternal; - } - uint32_t getExternalReferenceCount() const noexcept { - return RefCountExternal; - } + // Set of all the kernel handles that can be used when updating this command. 
+ std::unordered_set ValidKernelHandles; - ur_exp_command_buffer_handle_t CommandBuffer; - ur_kernel_handle_t Kernel; - std::shared_ptr Node; CUDA_KERNEL_NODE_PARAMS Params; uint32_t WorkDim; size_t GlobalWorkOffset[3]; size_t GlobalWorkSize[3]; size_t LocalWorkSize[3]; +}; -private: - std::atomic_uint32_t RefCountInternal; - std::atomic_uint32_t RefCountExternal; +struct usm_memcpy_command_handle : ur_exp_command_buffer_command_handle_t_ { + usm_memcpy_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, + CUgraphNode Node, CUgraphNode SignalNode, + std::vector WaitNodes) + : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, + WaitNodes) {} + CommandType getCommandType() const noexcept override { + return CommandType::USMMemcpy; + } +}; + +struct usm_fill_command_handle : ur_exp_command_buffer_command_handle_t_ { + usm_fill_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, + CUgraphNode Node, CUgraphNode SignalNode, + std::vector WaitNodes) + : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, + WaitNodes) {} + CommandType getCommandType() const noexcept override { + return CommandType::USMFill; + } +}; + +struct buffer_copy_command_handle : ur_exp_command_buffer_command_handle_t_ { + buffer_copy_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, + CUgraphNode Node, CUgraphNode SignalNode, + std::vector WaitNodes) + : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, + WaitNodes) {} + CommandType getCommandType() const noexcept override { + return CommandType::MemBufferCopy; + } +}; + +struct buffer_copy_rect_command_handle + : ur_exp_command_buffer_command_handle_t_ { + buffer_copy_rect_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, + CUgraphNode Node, CUgraphNode SignalNode, + std::vector WaitNodes) + : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, + WaitNodes) {} + CommandType getCommandType() const noexcept override { + 
return CommandType::MemBufferCopyRect; + } +}; + +struct buffer_read_command_handle : ur_exp_command_buffer_command_handle_t_ { + buffer_read_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, + CUgraphNode Node, CUgraphNode SignalNode, + std::vector WaitNodes) + : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, + WaitNodes) {} + CommandType getCommandType() const noexcept override { + return CommandType::MemBufferRead; + } +}; + +struct buffer_read_rect_command_handle + : ur_exp_command_buffer_command_handle_t_ { + buffer_read_rect_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, + CUgraphNode Node, CUgraphNode SignalNode, + std::vector WaitNodes) + : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, + WaitNodes) {} + CommandType getCommandType() const noexcept override { + return CommandType::MemBufferReadRect; + } +}; + +struct buffer_write_command_handle : ur_exp_command_buffer_command_handle_t_ { + buffer_write_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, + CUgraphNode Node, CUgraphNode SignalNode, + std::vector WaitNodes) + : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, + WaitNodes) {} + CommandType getCommandType() const noexcept override { + return CommandType::MemBufferWrite; + } +}; + +struct buffer_write_rect_command_handle + : ur_exp_command_buffer_command_handle_t_ { + buffer_write_rect_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, + CUgraphNode Node, CUgraphNode SignalNode, + std::vector WaitNodes) + : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, + WaitNodes) {} + CommandType getCommandType() const noexcept override { + return CommandType::MemBufferWriteRect; + } +}; + +struct buffer_fill_command_handle : ur_exp_command_buffer_command_handle_t_ { + buffer_fill_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, + CUgraphNode Node, CUgraphNode SignalNode, + std::vector WaitNodes) + : 
ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, + WaitNodes) {} + CommandType getCommandType() const noexcept override { + return CommandType::MemBufferFill; + } +}; + +struct usm_prefetch_command_handle : ur_exp_command_buffer_command_handle_t_ { + usm_prefetch_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, + CUgraphNode Node, CUgraphNode SignalNode, + std::vector WaitNodes) + : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, + WaitNodes) {} + CommandType getCommandType() const noexcept override { + return CommandType::USMPrefetch; + } +}; + +struct usm_advise_command_handle : ur_exp_command_buffer_command_handle_t_ { + usm_advise_command_handle(ur_exp_command_buffer_handle_t CommandBuffer, + CUgraphNode Node, CUgraphNode SignalNode, + std::vector WaitNodes) + : ur_exp_command_buffer_command_handle_t_(CommandBuffer, Node, SignalNode, + WaitNodes) {} + CommandType getCommandType() const noexcept override { + return CommandType::USMAdvise; + } }; struct ur_exp_command_buffer_handle_t_ { @@ -115,11 +287,11 @@ struct ur_exp_command_buffer_handle_t_ { ur_exp_command_buffer_handle_t_(ur_context_handle_t Context, ur_device_handle_t Device, bool IsUpdatable); - ~ur_exp_command_buffer_handle_t_(); + virtual ~ur_exp_command_buffer_handle_t_(); void registerSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint, - std::shared_ptr CuNode) { - SyncPoints[SyncPoint] = std::move(CuNode); + CUgraphNode CuNode) { + SyncPoints[SyncPoint] = CuNode; NextSyncPoint++; } @@ -127,11 +299,28 @@ struct ur_exp_command_buffer_handle_t_ { return NextSyncPoint; } + // Creates a cuEvent object and adds a cuGraphAddEventRecordNode node to the + // graph. + // @param[in] DepNode Node for the EventRecord node to depend on. + // @param[out] SignalNode Node created by cuGraphAddEventRecordNode. + // @return UR event backed by CuEvent object that will be recorded to. 
+ std::unique_ptr addSignalNode(CUgraphNode DepNode, + CUgraphNode &SignalNode); + + // Adds a cuGraphAddEventWaitNodes node to the graph + // @param[in,out] Dependencies for each of the wait nodes created. Set to the + // list of wait nodes created on success. + // @param[in] NumEventsInWaitList Number of wait nodes to create. + // @param[in] UR events wrapping the cuEvent objects the nodes will wait on. + // @returns UR_RESULT_SUCCESS or an error + ur_result_t addWaitNodes(std::vector &DepsList, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList); + // Helper to register next sync point // @param CuNode Node to register as next sync point // @return Pointer to the sync that registers the Node - ur_exp_command_buffer_sync_point_t - addSyncPoint(std::shared_ptr CuNode) { + ur_exp_command_buffer_sync_point_t addSyncPoint(CUgraphNode CuNode) { ur_exp_command_buffer_sync_point_t SyncPoint = NextSyncPoint; registerSyncPoint(SyncPoint, std::move(CuNode)); return SyncPoint; @@ -173,8 +362,7 @@ struct ur_exp_command_buffer_handle_t_ { std::atomic_uint32_t RefCountExternal; // Map of sync_points to ur_events - std::unordered_map> + std::unordered_map SyncPoints; // Next sync_point value (may need to consider ways to reuse values if 32-bits // is not enough) diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index ea134c53a4..ea7c4da8ec 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -57,12 +57,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(4318u); } case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { - int ComputeUnits = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &ComputeUnits, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - hDevice->get())); - detail::ur::assertion(ComputeUnits >= 0); - return ReturnValue(static_cast(ComputeUnits)); + return ReturnValue(hDevice->getNumComputeUnits()); } case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { return 
ReturnValue(MaxWorkItemDimensions); @@ -545,19 +540,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(size_t(1000)); } case UR_DEVICE_INFO_ENDIAN_LITTLE: { - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_AVAILABLE: { - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: { - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_COMPILER_AVAILABLE: { - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_LINKER_AVAILABLE: { - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: { auto Capability = ur_device_exec_capability_flags_t{ @@ -621,7 +616,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_EXTENSIONS: { std::string SupportedExtensions = "cl_khr_fp64 cl_khr_subgroups "; - SupportedExtensions += "pi_ext_intel_devicelib_assert "; + SupportedExtensions += "cl_intel_devicelib_assert "; // Return supported for the UR command-buffer experimental feature SupportedExtensions += "ur_exp_command_buffer "; SupportedExtensions += "ur_exp_usm_p2p "; @@ -647,7 +642,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(size_t(1024)); } case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: { - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_PARENT_DEVICE: { return ReturnValue(nullptr); @@ -839,20 +834,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP: { // On CUDA bindless images are supported. 
- return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP: { // On CUDA bindless images can be backed by shared (managed) USM. - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_BINDLESS_IMAGES_1D_USM_SUPPORT_EXP: { // On CUDA 1D bindless image USM is supported, but sampling is not. // More specifically, linear filtering is not supported. - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_BINDLESS_IMAGES_2D_USM_SUPPORT_EXP: { // On CUDA 2D bindless image USM is supported. - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP: { int32_t tex_pitch_align = 0; @@ -884,11 +879,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP: { // CUDA supports mipmaps. - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_MIPMAP_ANISOTROPY_SUPPORT_EXP: { // CUDA supports anisotropic filtering. - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP: { // CUDA has no query for this, but documentation states max value is 16. @@ -896,68 +891,68 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP: { // CUDA supports creation of images from individual mipmap levels. - return ReturnValue(true); + return ReturnValue(static_cast(true)); } - case UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP: { + case UR_DEVICE_INFO_EXTERNAL_MEMORY_IMPORT_SUPPORT_EXP: { // CUDA supports importing external memory. 
- return ReturnValue(true); + return ReturnValue(static_cast(true)); } - case UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP: { + case UR_DEVICE_INFO_EXTERNAL_SEMAPHORE_IMPORT_SUPPORT_EXP: { // CUDA supports importing external semaphores. - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP: { // CUDA supports cubemaps. - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP: { // CUDA supports cubemap seamless filtering. - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP: { // CUDA does support fetching 1D USM sampled image data. - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP: { // CUDA does not support fetching 1D non-USM sampled image data. - return ReturnValue(false); + return ReturnValue(static_cast(false)); } case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP: { // CUDA does support fetching 2D USM sampled image data. - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP: { // CUDA does support fetching 2D non-USM sampled image data. - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP: { // CUDA does support fetching 3D non-USM sampled image data. 
- return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_IMAGE_ARRAY_SUPPORT_EXP: { // CUDA does support image arrays - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_BINDLESS_UNIQUE_ADDRESSING_PER_DIM_EXP: { // CUDA does support unique addressing per dimension - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_BINDLESS_SAMPLE_1D_USM_EXP: { // CUDA does not support sampling 1D USM sampled image data. - return ReturnValue(false); + return ReturnValue(static_cast(false)); } case UR_DEVICE_INFO_BINDLESS_SAMPLE_2D_USM_EXP: { // CUDA does support sampling 1D USM sampled image data. - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { // CUDA supports recording timestamp events. - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: { // CUDA supports enqueueing native work through the urNativeEnqueueExp - return ReturnValue(true); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_DEVICE_ID: { int Value = 0; @@ -1057,9 +1052,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(static_cast(MaxRegisters)); } case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - return ReturnValue(false); + return ReturnValue(static_cast(false)); case UR_DEVICE_INFO_IMAGE_SRGB: - return ReturnValue(false); + return ReturnValue(static_cast(false)); case UR_DEVICE_INFO_PCI_ADDRESS: { constexpr size_t AddressBufferSize = 13; char AddressBuffer[AddressBufferSize]; @@ -1071,16 +1066,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, strnlen(AddressBuffer, AddressBufferSize - 1) + 1); } case UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS: - return ReturnValue(false); + return ReturnValue(static_cast(false)); // TODO: Investigate if this information 
is available on CUDA. case UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED: - return ReturnValue(false); + return ReturnValue(static_cast(false)); case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT: - return ReturnValue(true); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_ESIMD_SUPPORT: - return ReturnValue(false); + return ReturnValue(static_cast(false)); case UR_DEVICE_INFO_GLOBAL_VARIABLE_SUPPORT: - return ReturnValue(true); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_COMPONENT_DEVICES: case UR_DEVICE_INFO_COMPOSITE_DEVICE: case UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS: @@ -1093,8 +1088,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: - case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: + case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP: return ReturnValue(true); + case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP: { + ur_device_command_buffer_update_capability_flags_t UpdateCapabilities = + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS; + return ReturnValue(UpdateCapabilities); + } case UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP: { int Value = getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 9; @@ -1185,27 +1190,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( /// \return TBD UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, - const ur_device_native_properties_t *pProperties, + ur_native_handle_t hNativeDevice, + [[maybe_unused]] 
ur_adapter_handle_t hAdapter, + [[maybe_unused]] const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { - std::ignore = pProperties; - CUdevice CuDevice = static_cast(hNativeDevice); auto IsDevice = [=](std::unique_ptr &Dev) { return Dev->get() == CuDevice; }; - // If a platform is provided just check if the device is in it - if (hPlatform) { - auto SearchRes = std::find_if(begin(hPlatform->Devices), - end(hPlatform->Devices), IsDevice); - if (SearchRes != end(hPlatform->Devices)) { - *phDevice = SearchRes->get(); - return UR_RESULT_SUCCESS; - } - } - // Get list of platforms uint32_t NumPlatforms = 0; ur_adapter_handle_t AdapterHandle = &adapter; diff --git a/source/adapters/cuda/device.hpp b/source/adapters/cuda/device.hpp index 0a40329026..3654f2bb36 100644 --- a/source/adapters/cuda/device.hpp +++ b/source/adapters/cuda/device.hpp @@ -32,6 +32,7 @@ struct ur_device_handle_t_ { int MaxCapacityLocalMem{0}; int MaxChosenLocalMem{0}; bool MaxLocalMemSizeChosen{false}; + uint32_t NumComputeUnits{0}; public: ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase, @@ -54,6 +55,10 @@ struct ur_device_handle_t_ { sizeof(MaxWorkGroupSize), &MaxWorkGroupSize, nullptr)); + UR_CHECK_ERROR(cuDeviceGetAttribute( + reinterpret_cast(&NumComputeUnits), + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuDevice)); + // Set local mem max size if env var is present static const char *LocalMemSizePtrUR = std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE"); @@ -107,6 +112,8 @@ struct ur_device_handle_t_ { int getMaxChosenLocalMem() const noexcept { return MaxChosenLocalMem; }; bool maxLocalMemSizeChosen() { return MaxLocalMemSizeChosen; }; + + uint32_t getNumComputeUnits() const noexcept { return NumComputeUnits; }; }; int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute); diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 1c074025a9..0e00f680f6 100644 --- a/source/adapters/cuda/enqueue.cpp +++ 
b/source/adapters/cuda/enqueue.cpp @@ -203,6 +203,7 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context, // Set the active context here as guessLocalWorkSize needs an active context ScopedContext Active(Device); { + size_t *MaxThreadsPerBlock = Kernel->MaxThreadsPerBlock; size_t *ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock; MaxWorkGroupSize = Device->getMaxWorkGroupSize(); @@ -212,6 +213,10 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context, LocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim]) return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + if (MaxThreadsPerBlock[Dim] != 0 && + LocalWorkSize[Dim] > MaxThreadsPerBlock[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + if (LocalWorkSize[Dim] > Device->getMaxWorkItemSizes(Dim)) return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; // Checks that local work sizes are a divisor of the global work sizes @@ -235,6 +240,12 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context, KernelLocalWorkGroupSize *= LocalWorkSize[Dim]; } + if (size_t MaxLinearThreadsPerBlock = Kernel->MaxLinearThreadsPerBlock; + MaxLinearThreadsPerBlock && + MaxLinearThreadsPerBlock < KernelLocalWorkGroupSize) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (hasExceededMaxRegistersPerBlock(Device, Kernel, KernelLocalWorkGroupSize)) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; @@ -666,6 +677,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( } return UR_RESULT_SUCCESS; #else + [[maybe_unused]] auto _ = launchPropList; setErrorMessage("This feature requires cuda 11.8 or later.", UR_RESULT_ERROR_ADAPTER_SPECIFIC); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index 9889031f1b..a6c2208e8f 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -164,8 +164,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, UrReturnHelper 
ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); switch (propName) { - case UR_EVENT_INFO_COMMAND_QUEUE: + case UR_EVENT_INFO_COMMAND_QUEUE: { + // If the runtime owns the native handle, we have reference to the queue. + // Otherwise, the event handle comes from an interop API with no RT refs. + if (!hEvent->getQueue()) { + setErrorMessage("Command queue info cannot be queried for the event. The " + "event object was created from a native event and has no " + "valid reference to a command queue.", + UR_RESULT_ERROR_INVALID_VALUE); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } return ReturnValue(hEvent->getQueue()); + } case UR_EVENT_INFO_COMMAND_TYPE: return ReturnValue(hEvent->getCommandType()); case UR_EVENT_INFO_REFERENCE_COUNT: diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp index 66a28a820b..4840553cc1 100644 --- a/source/adapters/cuda/image.cpp +++ b/source/adapters/cuda/image.cpp @@ -65,101 +65,59 @@ ur_result_t urCalculateNumChannels(ur_image_channel_order_t order, /// format if not nullptr. /// /param return_pixel_size_bytes will be set to the pixel /// byte size if not nullptr. +/// /param return_normalized_dtype_flag will be set if the +/// data type is normalized if not nullptr. 
ur_result_t urToCudaImageChannelFormat(ur_image_channel_type_t image_channel_type, ur_image_channel_order_t image_channel_order, CUarray_format *return_cuda_format, - size_t *return_pixel_size_bytes) { + size_t *return_pixel_size_bytes, + unsigned int *return_normalized_dtype_flag) { - CUarray_format cuda_format; + CUarray_format cuda_format = CU_AD_FORMAT_UNSIGNED_INT8; size_t pixel_size_bytes = 0; unsigned int num_channels = 0; + unsigned int normalized_dtype_flag = 0; UR_CHECK_ERROR(urCalculateNumChannels(image_channel_order, &num_channels)); switch (image_channel_type) { -#define CASE(FROM, TO, SIZE) \ +#define CASE(FROM, TO, SIZE, NORM) \ case FROM: { \ cuda_format = TO; \ pixel_size_bytes = SIZE * num_channels; \ + normalized_dtype_flag = NORM; \ break; \ } - CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, CU_AD_FORMAT_UNSIGNED_INT8, 1) - CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8, CU_AD_FORMAT_SIGNED_INT8, 1) - CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, CU_AD_FORMAT_UNSIGNED_INT16, 2) - CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16, CU_AD_FORMAT_SIGNED_INT16, 2) - CASE(UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT, CU_AD_FORMAT_HALF, 2) - CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, CU_AD_FORMAT_UNSIGNED_INT32, 4) - CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32, CU_AD_FORMAT_SIGNED_INT32, 4) - CASE(UR_IMAGE_CHANNEL_TYPE_FLOAT, CU_AD_FORMAT_FLOAT, 4) + CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, CU_AD_FORMAT_UNSIGNED_INT8, 1, 0) + CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8, CU_AD_FORMAT_SIGNED_INT8, 1, 0) + CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, CU_AD_FORMAT_UNSIGNED_INT16, 2, + 0) + CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16, CU_AD_FORMAT_SIGNED_INT16, 2, 0) + CASE(UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT, CU_AD_FORMAT_HALF, 2, 0) + CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, CU_AD_FORMAT_UNSIGNED_INT32, 4, + 0) + CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32, CU_AD_FORMAT_SIGNED_INT32, 4, 0) + CASE(UR_IMAGE_CHANNEL_TYPE_FLOAT, CU_AD_FORMAT_FLOAT, 4, 0) + CASE(UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, 
CU_AD_FORMAT_UNSIGNED_INT8, 1, 1) + CASE(UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, CU_AD_FORMAT_SIGNED_INT8, 1, 1) + CASE(UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, CU_AD_FORMAT_UNSIGNED_INT16, 2, 1) + CASE(UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, CU_AD_FORMAT_SIGNED_INT16, 2, 1) #undef CASE default: break; } - // These new formats were brought in in CUDA 11.5 -#if CUDA_VERSION >= 11050 - - // If none of the above channel types were passed, check those below - if (pixel_size_bytes == 0) { - - // We can't use a switch statement here because these single - // UR_IMAGE_CHANNEL_TYPEs can correspond to multiple [u/s]norm CU_AD_FORMATs - // depending on the number of channels. We use a std::map instead to - // retrieve the correct CUDA format - - // map < , > - const std::map, - std::pair> - norm_channel_type_map{ - {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, 1}, - {CU_AD_FORMAT_UNORM_INT8X1, 1}}, - {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, 2}, - {CU_AD_FORMAT_UNORM_INT8X2, 2}}, - {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, 4}, - {CU_AD_FORMAT_UNORM_INT8X4, 4}}, - - {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, 1}, - {CU_AD_FORMAT_SNORM_INT8X1, 1}}, - {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, 2}, - {CU_AD_FORMAT_SNORM_INT8X2, 2}}, - {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, 4}, - {CU_AD_FORMAT_SNORM_INT8X4, 4}}, - - {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, 1}, - {CU_AD_FORMAT_UNORM_INT16X1, 2}}, - {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, 2}, - {CU_AD_FORMAT_UNORM_INT16X2, 4}}, - {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, 4}, - {CU_AD_FORMAT_UNORM_INT16X4, 8}}, - - {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, 1}, - {CU_AD_FORMAT_SNORM_INT16X1, 2}}, - {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, 2}, - {CU_AD_FORMAT_SNORM_INT16X2, 4}}, - {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, 4}, - {CU_AD_FORMAT_SNORM_INT16X4, 8}}, - }; - - try { - auto cuda_format_and_size = norm_channel_type_map.at( - std::make_pair(image_channel_type, num_channels)); - cuda_format = cuda_format_and_size.first; - pixel_size_bytes = cuda_format_and_size.second; - } catch (const std::out_of_range &) { - 
return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT; - } - } - -#endif - if (return_cuda_format) { *return_cuda_format = cuda_format; } if (return_pixel_size_bytes) { *return_pixel_size_bytes = pixel_size_bytes; } + if (return_normalized_dtype_flag) { + *return_normalized_dtype_flag = normalized_dtype_flag; + } return UR_RESULT_SUCCESS; } @@ -189,46 +147,9 @@ cudaToUrImageChannelFormat(CUarray_format cuda_format, UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT); CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_FLOAT, UR_IMAGE_CHANNEL_TYPE_FLOAT); -#if CUDA_VERSION >= 11050 - - // Note that the CUDA UNORM and SNORM formats also encode the number of - // channels. - // Since UR does not encode this, we map different CUDA formats to the same - // UR channel type. - // Since this function is only called from `urBindlessImagesImageGetInfoExp` - // which has access to `CUDA_ARRAY3D_DESCRIPTOR`, we can determine the - // number of channels in the calling function. - - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT8X1, - UR_IMAGE_CHANNEL_TYPE_UNORM_INT8); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT8X2, - UR_IMAGE_CHANNEL_TYPE_UNORM_INT8); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT8X4, - UR_IMAGE_CHANNEL_TYPE_UNORM_INT8); - - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT16X1, - UR_IMAGE_CHANNEL_TYPE_UNORM_INT16); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT16X2, - UR_IMAGE_CHANNEL_TYPE_UNORM_INT16); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT16X4, - UR_IMAGE_CHANNEL_TYPE_UNORM_INT16); - - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT8X1, - UR_IMAGE_CHANNEL_TYPE_SNORM_INT8); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT8X2, - UR_IMAGE_CHANNEL_TYPE_SNORM_INT8); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT8X4, - UR_IMAGE_CHANNEL_TYPE_SNORM_INT8); - - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT16X1, - UR_IMAGE_CHANNEL_TYPE_SNORM_INT16); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT16X2, - 
UR_IMAGE_CHANNEL_TYPE_SNORM_INT16); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT16X4, - UR_IMAGE_CHANNEL_TYPE_SNORM_INT16); -#endif -#undef MAP default: + // Default invalid enum + *return_image_channel_type = UR_IMAGE_CHANNEL_TYPE_FORCE_UINT32; return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT; } } @@ -236,6 +157,7 @@ cudaToUrImageChannelFormat(CUarray_format cuda_format, ur_result_t urTextureCreate(ur_sampler_handle_t hSampler, const ur_image_desc_t *pImageDesc, const CUDA_RESOURCE_DESC &ResourceDesc, + const unsigned int normalized_dtype_flag, ur_exp_image_native_handle_t *phRetImage) { try { @@ -306,8 +228,9 @@ ur_result_t urTextureCreate(ur_sampler_handle_t hSampler, // CUDA default promotes 8-bit and 16-bit integers to float between [0,1] // This flag prevents this behaviour. - ImageTexDesc.flags |= CU_TRSF_READ_AS_INTEGER; - + if (!normalized_dtype_flag) { + ImageTexDesc.flags |= CU_TRSF_READ_AS_INTEGER; + } // Cubemap attributes ur_exp_sampler_cubemap_filter_mode_t CubemapFilterModeProp = hSampler->getCubemapFilterMode(); @@ -315,8 +238,8 @@ ur_result_t urTextureCreate(ur_sampler_handle_t hSampler, #if CUDA_VERSION >= 11060 ImageTexDesc.flags |= CU_TRSF_SEAMLESS_CUBEMAP; #else - setErrorMessage("The " UR_EXP_SAMPLER_CUBEMAP_FILTER_MODE_SEAMLESS - " feature requires cuda 11.6 or later.", + setErrorMessage("The UR_EXP_SAMPLER_CUBEMAP_FILTER_MODE_SEAMLESS " + "feature requires cuda 11.6 or later.", UR_RESULT_ERROR_ADAPTER_SPECIFIC); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; #endif @@ -413,9 +336,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( UR_CHECK_ERROR(urCalculateNumChannels(pImageFormat->channelOrder, &array_desc.NumChannels)); - UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType, - pImageFormat->channelOrder, - &array_desc.Format, nullptr)); + UR_CHECK_ERROR(urToCudaImageChannelFormat( + pImageFormat->channelType, pImageFormat->channelOrder, &array_desc.Format, + nullptr, nullptr)); array_desc.Flags = 
0; // No flags required array_desc.Width = pImageDesc->width; @@ -455,21 +378,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( // Allocate a cuArray if (pImageDesc->numMipLevel == 1) { - CUarray ImageArray; + CUarray ImageArray{}; try { UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &array_desc)); *phImageMem = (ur_exp_image_mem_native_handle_t)ImageArray; } catch (ur_result_t Err) { - cuArrayDestroy(ImageArray); + if (ImageArray != CUarray{}) { + UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); + } return Err; } catch (...) { - cuArrayDestroy(ImageArray); + if (ImageArray != CUarray{}) { + UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); + } return UR_RESULT_ERROR_UNKNOWN; } } else // Allocate a cuMipmappedArray { - CUmipmappedArray mip_array; + CUmipmappedArray mip_array{}; array_desc.Flags = CUDA_ARRAY3D_SURFACE_LDST; try { @@ -477,10 +404,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( pImageDesc->numMipLevel)); *phImageMem = (ur_exp_image_mem_native_handle_t)mip_array; } catch (ur_result_t Err) { - cuMipmappedArrayDestroy(mip_array); + if (mip_array) { + UR_CHECK_ERROR(cuMipmappedArrayDestroy(mip_array)); + } return Err; } catch (...) 
{ - cuMipmappedArrayDestroy(mip_array); + if (mip_array) { + UR_CHECK_ERROR(cuMipmappedArrayDestroy(mip_array)); + } return UR_RESULT_ERROR_UNKNOWN; } } @@ -526,7 +457,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( size_t PixelSizeBytes; UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType, pImageFormat->channelOrder, &format, - &PixelSizeBytes)); + &PixelSizeBytes, nullptr)); try { @@ -571,9 +502,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( CUarray_format format; size_t PixelSizeBytes; - UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType, - pImageFormat->channelOrder, &format, - &PixelSizeBytes)); + unsigned int normalized_dtype_flag; + UR_CHECK_ERROR(urToCudaImageChannelFormat( + pImageFormat->channelType, pImageFormat->channelOrder, &format, + &PixelSizeBytes, &normalized_dtype_flag)); try { CUDA_RESOURCE_DESC image_res_desc = {}; @@ -622,8 +554,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( return UR_RESULT_ERROR_INVALID_VALUE; } - UR_CHECK_ERROR( - urTextureCreate(hSampler, pImageDesc, image_res_desc, phImage)); + UR_CHECK_ERROR(urTextureCreate(hSampler, pImageDesc, image_res_desc, + normalized_dtype_flag, phImage)); } catch (ur_result_t Err) { return Err; @@ -649,6 +581,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( UR_ASSERT(pSrcImageFormat->channelOrder == pDstImageFormat->channelOrder, UR_RESULT_ERROR_INVALID_ARGUMENT); + auto as_CUArray = [](const void *ptr) { + return static_cast(const_cast(ptr)); + }; + unsigned int NumChannels = 0; size_t PixelSizeBytes = 0; @@ -659,7 +595,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( // later. 
UR_CHECK_ERROR(urToCudaImageChannelFormat(pSrcImageFormat->channelType, pSrcImageFormat->channelOrder, - nullptr, &PixelSizeBytes)); + nullptr, &PixelSizeBytes, nullptr)); try { ScopedContext Active(hQueue->getDevice()); @@ -751,13 +687,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; cpy_desc.srcHost = pSrc; - cpy_desc.srcPitch = pCopyRegion->copyExtent.width * PixelSizeBytes; - cpy_desc.srcHeight = pCopyRegion->copyExtent.height; + cpy_desc.srcPitch = pSrcImageDesc->width * PixelSizeBytes; + cpy_desc.srcHeight = std::max(uint64_t{1}, pSrcImageDesc->height); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height); - cpy_desc.Depth = pDstImageDesc->arraySize; + cpy_desc.Depth = pCopyRegion->copyExtent.depth; UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); } } else if (imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST) { @@ -777,7 +713,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( if (isCudaArray) { UR_CHECK_ERROR( - cuMemcpyAtoHAsync(DstWithOffset, (CUarray)pSrc, + cuMemcpyAtoHAsync(DstWithOffset, as_CUArray(pSrc), PixelSizeBytes * pCopyRegion->srcOffset.x, CopyExtentBytes, Stream)); } else if (memType == CU_MEMORYTYPE_DEVICE) { @@ -801,7 +737,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstHost = pDst; if (pSrcImageDesc->rowPitch == 0) { cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.srcArray = (CUarray)pSrc; + cpy_desc.srcArray = as_CUArray(pSrc); } else { // Pitched memory cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE; @@ -823,7 +759,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstY = 
pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.srcArray = (CUarray)pSrc; + cpy_desc.srcArray = as_CUArray(pSrc); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; cpy_desc.dstHost = pDst; cpy_desc.dstPitch = pDstImageDesc->width * PixelSizeBytes; @@ -843,14 +779,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.srcArray = (CUarray)pSrc; + cpy_desc.srcArray = as_CUArray(pSrc); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; cpy_desc.dstHost = pDst; cpy_desc.dstPitch = pDstImageDesc->width * PixelSizeBytes; - cpy_desc.dstHeight = pDstImageDesc->height; + cpy_desc.dstHeight = std::max(uint64_t{1}, pDstImageDesc->height); cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height); - cpy_desc.Depth = pSrcImageDesc->arraySize; + cpy_desc.Depth = pCopyRegion->copyExtent.depth; UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); } } else { @@ -873,7 +809,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; cpy_desc.dstY = 0; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.srcArray = (CUarray)pSrc; + cpy_desc.srcArray = as_CUArray(pSrc); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; @@ -886,7 +822,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - 
cpy_desc.srcArray = (CUarray)pSrc; + cpy_desc.srcArray = as_CUArray(pSrc); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; @@ -901,7 +837,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.srcArray = (CUarray)pSrc; + cpy_desc.srcArray = as_CUArray(pSrc); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; @@ -919,12 +855,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.srcArray = (CUarray)pSrc; + cpy_desc.srcArray = as_CUArray(pSrc); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height); - cpy_desc.Depth = pSrcImageDesc->arraySize; + cpy_desc.Depth = pCopyRegion->copyExtent.depth; UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); } // Synchronization is required here to handle the case of copying data @@ -953,8 +889,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageGetInfoExp( ur_context_handle_t, ur_exp_image_mem_native_handle_t hImageMem, ur_image_info_t propName, void *pPropValue, size_t *pPropSizeRet) { + CUarray hCUarray; + CUresult Err = cuMipmappedArrayGetLevel( + &hCUarray, reinterpret_cast(hImageMem), 0); + + // If cuMipmappedArrayGetLevel failed, hImageMem is already CUarray. 
+ if (Err != CUDA_SUCCESS) { + hCUarray = reinterpret_cast(hImageMem); + } + CUDA_ARRAY3D_DESCRIPTOR ArrayDesc; - UR_CHECK_ERROR(cuArray3DGetDescriptor(&ArrayDesc, (CUarray)hImageMem)); + UR_CHECK_ERROR(cuArray3DGetDescriptor(&ArrayDesc, hCUarray)); switch (propName) { case UR_IMAGE_INFO_WIDTH: if (pPropValue) { @@ -1058,8 +1003,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapFreeExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, ur_exp_external_mem_type_t memHandleType, - ur_exp_interop_mem_desc_t *pInteropMemDesc, - ur_exp_interop_mem_handle_t *phInteropMem) { + ur_exp_external_mem_desc_t *pExternalMemDesc, + ur_exp_external_mem_handle_t *phExternalMem) { UR_ASSERT(std::find(hContext->getDevices().begin(), hContext->getDevices().end(), hDevice) != hContext->getDevices().end(), @@ -1071,7 +1016,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = {}; extMemDesc.size = size; - void *pNext = const_cast(pInteropMemDesc->pNext); + void *pNext = const_cast(pExternalMemDesc->pNext); while (pNext != nullptr) { const ur_base_desc_t *BaseDesc = static_cast(pNext); @@ -1103,7 +1048,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( CUexternalMemory extMem; UR_CHECK_ERROR(cuImportExternalMemory(&extMem, &extMemDesc)); - *phInteropMem = (ur_exp_interop_mem_handle_t)extMem; + *phExternalMem = (ur_exp_external_mem_handle_t)extMem; } catch (ur_result_t Err) { return Err; @@ -1117,7 +1062,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, - ur_exp_interop_mem_handle_t hInteropMem, + ur_exp_external_mem_handle_t hExternalMem, 
ur_exp_image_mem_native_handle_t *phImageMem) { UR_ASSERT(std::find(hContext->getDevices().begin(), hContext->getDevices().end(), @@ -1129,8 +1074,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( urCalculateNumChannels(pImageFormat->channelOrder, &NumChannels)); CUarray_format format; - UR_CHECK_ERROR(urToCudaImageChannelFormat( - pImageFormat->channelType, pImageFormat->channelOrder, &format, nullptr)); + UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType, + pImageFormat->channelOrder, &format, + nullptr, nullptr)); try { ScopedContext Active(hDevice); @@ -1150,7 +1096,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( // If desired, a CUarray is retrieved from the mipmaps 0th level CUmipmappedArray memMipMap; UR_CHECK_ERROR(cuExternalMemoryGetMappedMipmappedArray( - &memMipMap, (CUexternalMemory)hInteropMem, &mipmapDesc)); + &memMipMap, (CUexternalMemory)hExternalMem, &mipmapDesc)); if (pImageDesc->numMipLevel > 1) { *phImageMem = (ur_exp_image_mem_native_handle_t)memMipMap; @@ -1169,9 +1115,39 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint64_t offset, + uint64_t size, ur_exp_external_mem_handle_t hExternalMem, void **ppRetMem) { + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), + UR_RESULT_ERROR_INVALID_CONTEXT); + + try { + ScopedContext Active(hDevice); + + CUDA_EXTERNAL_MEMORY_BUFFER_DESC BufferDesc = {}; + BufferDesc.size = size; + BufferDesc.offset = offset; + BufferDesc.flags = 0; + + CUdeviceptr retMem; + UR_CHECK_ERROR(cuExternalMemoryGetMappedBuffer( + &retMem, (CUexternalMemory)hExternalMem, &BufferDesc)); + + *ppRetMem = (void 
*)retMem; + + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_exp_interop_mem_handle_t hInteropMem) { + ur_exp_external_mem_handle_t hExternalMem) { UR_ASSERT(std::find(hContext->getDevices().begin(), hContext->getDevices().end(), hDevice) != hContext->getDevices().end(), @@ -1179,7 +1155,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( try { ScopedContext Active(hDevice); - UR_CHECK_ERROR(cuDestroyExternalMemory((CUexternalMemory)hInteropMem)); + UR_CHECK_ERROR(cuDestroyExternalMemory((CUexternalMemory)hExternalMem)); } catch (ur_result_t Err) { return Err; } catch (...) { @@ -1191,8 +1167,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_external_semaphore_type_t semHandleType, - ur_exp_interop_semaphore_desc_t *pInteropSemaphoreDesc, - ur_exp_interop_semaphore_handle_t *phInteropSemaphoreHandle) { + ur_exp_external_semaphore_desc_t *pExternalSemaphoreDesc, + ur_exp_external_semaphore_handle_t *phExternalSemaphoreHandle) { UR_ASSERT(std::find(hContext->getDevices().begin(), hContext->getDevices().end(), hDevice) != hContext->getDevices().end(), @@ -1203,7 +1179,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC extSemDesc = {}; - void *pNext = const_cast(pInteropSemaphoreDesc->pNext); + void *pNext = const_cast(pExternalSemaphoreDesc->pNext); while (pNext != nullptr) { const ur_base_desc_t *BaseDesc = static_cast(pNext); @@ -1234,7 +1210,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( CUexternalSemaphore semaphore; 
UR_CHECK_ERROR(cuImportExternalSemaphore(&semaphore, &extSemDesc)); - *phInteropSemaphoreHandle = (ur_exp_interop_semaphore_handle_t)semaphore; + *phExternalSemaphoreHandle = (ur_exp_external_semaphore_handle_t)semaphore; } catch (ur_result_t Err) { return Err; } catch (...) { @@ -1245,7 +1221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_exp_interop_semaphore_handle_t hInteropSemaphore) { + ur_exp_external_semaphore_handle_t hExternalSemaphore) { UR_ASSERT(std::find(hContext->getDevices().begin(), hContext->getDevices().end(), hDevice) != hContext->getDevices().end(), @@ -1254,7 +1230,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( try { ScopedContext Active(hDevice); UR_CHECK_ERROR( - cuDestroyExternalSemaphore((CUexternalSemaphore)hInteropSemaphore)); + cuDestroyExternalSemaphore((CUexternalSemaphore)hExternalSemaphore)); } catch (ur_result_t Err) { return Err; } catch (...) 
{ @@ -1264,7 +1240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( } UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( - ur_queue_handle_t hQueue, ur_exp_interop_semaphore_handle_t hSemaphore, + ur_queue_handle_t hQueue, ur_exp_external_semaphore_handle_t hSemaphore, bool hasValue, uint64_t waitValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { @@ -1286,7 +1262,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( if (phEvent) { auto NewEvent = ur_event_handle_t_::makeNative( - UR_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP, hQueue, Stream); + UR_COMMAND_EXTERNAL_SEMAPHORE_WAIT_EXP, hQueue, Stream); NewEvent->record(); *phEvent = NewEvent; } @@ -1299,7 +1275,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( } UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( - ur_queue_handle_t hQueue, ur_exp_interop_semaphore_handle_t hSemaphore, + ur_queue_handle_t hQueue, ur_exp_external_semaphore_handle_t hSemaphore, bool hasValue, uint64_t signalValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { @@ -1321,7 +1297,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( if (phEvent) { auto NewEvent = ur_event_handle_t_::makeNative( - UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP, hQueue, Stream); + UR_COMMAND_EXTERNAL_SEMAPHORE_SIGNAL_EXP, hQueue, Stream); NewEvent->record(); *phEvent = NewEvent; } diff --git a/source/adapters/cuda/image.hpp b/source/adapters/cuda/image.hpp index 162885a44e..7233d1785c 100644 --- a/source/adapters/cuda/image.hpp +++ b/source/adapters/cuda/image.hpp @@ -21,14 +21,15 @@ ur_result_t urToCudaImageChannelFormat(ur_image_channel_type_t image_channel_type, ur_image_channel_order_t image_channel_order, CUarray_format *return_cuda_format, - size_t 
*return_pixel_types_size_bytes); + size_t *return_pixel_types_size_bytes, + unsigned int *return_normalized_dtype_flag); ur_result_t cudaToUrImageChannelFormat(CUarray_format cuda_format, ur_image_channel_type_t *return_image_channel_type); -ur_result_t urTextureCreate(ur_context_handle_t hContext, - ur_sampler_desc_t SamplerDesc, +ur_result_t urTextureCreate(ur_sampler_handle_t hSampler, const ur_image_desc_t *pImageDesc, - CUDA_RESOURCE_DESC ResourceDesc, + const CUDA_RESOURCE_DESC &ResourceDesc, + const unsigned int normalized_dtype_flag, ur_exp_image_native_handle_t *phRetImage); diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp index d43bd046dc..5fb097c304 100644 --- a/source/adapters/cuda/kernel.cpp +++ b/source/adapters/cuda/kernel.cpp @@ -13,6 +13,7 @@ #include "memory.hpp" #include "queue.hpp" #include "sampler.hpp" +#include "ur_api.h" UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, @@ -124,6 +125,30 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, &Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get())); return ReturnValue(uint64_t(Bytes)); } + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: { + size_t MaxGroupSize[3] = {0, 0, 0}; + const auto &MaxWGSizeMDMap = + hKernel->getProgram()->KernelMaxWorkGroupSizeMD; + const auto MaxWGSizeMD = MaxWGSizeMDMap.find(hKernel->getName()); + if (MaxWGSizeMD != MaxWGSizeMDMap.end()) { + const auto MaxWGSize = MaxWGSizeMD->second; + MaxGroupSize[0] = std::get<0>(MaxWGSize); + MaxGroupSize[1] = std::get<1>(MaxWGSize); + MaxGroupSize[2] = std::get<2>(MaxWGSize); + } + return ReturnValue(MaxGroupSize, 3); + } + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: { + size_t MaxLinearGroupSize = 0; + const auto &MaxLinearWGSizeMDMap = + hKernel->getProgram()->KernelMaxLinearWorkGroupSizeMD; + const auto MaxLinearWGSizeMD = + MaxLinearWGSizeMDMap.find(hKernel->getName()); + if 
(MaxLinearWGSizeMD != MaxLinearWGSizeMDMap.end()) { + MaxLinearGroupSize = MaxLinearWGSizeMD->second; + } + return ReturnValue(MaxLinearGroupSize); + } default: break; } @@ -167,10 +192,46 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, size_t localWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { - (void)hKernel; - (void)localWorkSize; - (void)dynamicSharedMemorySize; - *pGroupCountRet = 1; + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_KERNEL); + + // We need to set the active current device for this kernel explicitly here, + // because the occupancy querying API does not take device parameter. + ur_device_handle_t Device = hKernel->getProgram()->getDevice(); + ScopedContext Active(Device); + try { + // We need to calculate max num of work-groups using per-device semantics. + + int MaxNumActiveGroupsPerCU{0}; + UR_CHECK_ERROR(cuOccupancyMaxActiveBlocksPerMultiprocessor( + &MaxNumActiveGroupsPerCU, hKernel->get(), localWorkSize, + dynamicSharedMemorySize)); + detail::ur::assertion(MaxNumActiveGroupsPerCU >= 0); + // Handle the case where we can't have all SMs active with at least 1 group + // per SM. In that case, the device is still able to run 1 work-group, hence + // we will manually check if it is possible with the available HW resources. 
+ if (MaxNumActiveGroupsPerCU == 0) { + size_t MaxWorkGroupSize{}; + urKernelGetGroupInfo( + hKernel, Device, UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE, + sizeof(MaxWorkGroupSize), &MaxWorkGroupSize, nullptr); + size_t MaxLocalSizeBytes{}; + urDeviceGetInfo(Device, UR_DEVICE_INFO_LOCAL_MEM_SIZE, + sizeof(MaxLocalSizeBytes), &MaxLocalSizeBytes, nullptr); + if (localWorkSize > MaxWorkGroupSize || + dynamicSharedMemorySize > MaxLocalSizeBytes || + hasExceededMaxRegistersPerBlock(Device, hKernel, localWorkSize)) + *pGroupCountRet = 0; + else + *pGroupCountRet = 1; + } else { + // Multiply by the number of SMs (CUs = compute units) on the device in + // order to retrieve the total number of groups/blocks that can be + // launched. + *pGroupCountRet = Device->getNumComputeUnits() * MaxNumActiveGroupsPerCU; + } + } catch (ur_result_t Err) { + return Err; + } return UR_RESULT_SUCCESS; } @@ -307,7 +368,8 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, try { auto Device = hKernel->getProgram()->getDevice(); ur_mem_flags_t MemAccess = - Properties ? Properties->memoryAccess : UR_MEM_FLAG_READ_WRITE; + Properties ?
Properties->memoryAccess + : static_cast(UR_MEM_FLAG_READ_WRITE); hKernel->Args.addMemObjArg(argIndex, hArgValue, MemAccess); if (hArgValue->isImage()) { CUDA_ARRAY3D_DESCRIPTOR arrayDesc; @@ -410,3 +472,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( pSuggestedLocalWorkSize); return Result; } + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants( + ur_kernel_handle_t, uint32_t, const ur_specialization_constant_info_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index c6761d8525..7ad20a4f0e 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -46,6 +46,8 @@ struct ur_kernel_handle_t_ { static constexpr uint32_t ReqdThreadsPerBlockDimensions = 3u; size_t ReqdThreadsPerBlock[ReqdThreadsPerBlockDimensions]; + size_t MaxThreadsPerBlock[ReqdThreadsPerBlockDimensions]; + size_t MaxLinearThreadsPerBlock{0}; int RegsPerThread{0}; /// Structure that holds the arguments to the kernel. 
@@ -169,6 +171,18 @@ struct ur_kernel_handle_t_ { sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr); (void)RetError; assert(RetError == UR_RESULT_SUCCESS); + /// Note: this code assumes that there is only one device per context + RetError = urKernelGetGroupInfo( + this, Program->getDevice(), + UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE, + sizeof(MaxThreadsPerBlock), MaxThreadsPerBlock, nullptr); + assert(RetError == UR_RESULT_SUCCESS); + /// Note: this code assumes that there is only one device per context + RetError = urKernelGetGroupInfo( + this, Program->getDevice(), + UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE, + sizeof(MaxLinearThreadsPerBlock), &MaxLinearThreadsPerBlock, nullptr); + assert(RetError == UR_RESULT_SUCCESS); UR_CHECK_ERROR( cuFuncGetAttribute(&RegsPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, Func)); } diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index 1aefb15a3d..ea55c1669a 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -439,7 +439,7 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, UR_CHECK_ERROR(cuMemAlloc(&DevPtr, Buffer.Size)); } } else { - CUarray ImageArray; + CUarray ImageArray{}; CUsurfObject Surface; try { auto &Image = std::get(Mem->Mem); @@ -465,12 +465,12 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc)); Image.SurfObjs[DeviceIdx] = Surface; } catch (ur_result_t Err) { - if (ImageArray) { + if (ImageArray != CUarray{}) { UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); } return Err; } catch (...) 
{ - if (ImageArray) { + if (ImageArray != CUarray{}) { UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); } return UR_RESULT_ERROR_UNKNOWN; diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index aa992f44bf..6dcaa28414 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -197,20 +197,15 @@ struct SurfaceMem { void *HostPtr) : Arrays(Context->Devices.size(), CUarray{0}), SurfObjs(Context->Devices.size(), CUsurfObject{0}), - OuterMemStruct{OuterMemStruct}, - ImageFormat{ImageFormat}, ImageDesc{ImageDesc}, HostPtr{HostPtr} { + OuterMemStruct{OuterMemStruct}, ImageDesc{ImageDesc}, ArrayDesc{}, + HostPtr{HostPtr} { // We have to use hipArray3DCreate, which has some caveats. The height and // depth parameters must be set to 0 produce 1D or 2D arrays. image_desc // gives a minimum value of 1, so we need to convert the answer. ArrayDesc.NumChannels = 4; // Only support 4 channel image - ArrayDesc.Flags = 0; // No flags required ArrayDesc.Width = ImageDesc.width; - if (ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { - ArrayDesc.Height = 0; - ArrayDesc.Depth = 0; - } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { + if (ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { ArrayDesc.Height = ImageDesc.height; - ArrayDesc.Depth = 0; } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { ArrayDesc.Height = ImageDesc.height; ArrayDesc.Depth = ImageDesc.depth; @@ -414,10 +409,14 @@ struct ur_mem_handle_t_ { } ur_result_t clear() { - if (isBuffer()) { - return std::get(Mem).clear(); + try { + if (isBuffer()) { + return std::get(Mem).clear(); + } + return std::get(Mem).clear(); + } catch (const ur_result_t &error) { + return error; } - return std::get(Mem).clear(); } ur_context_handle_t getContext() const noexcept { return Context; } diff --git a/source/adapters/cuda/program.cpp b/source/adapters/cuda/program.cpp index 98757d710e..a475d43ce2 100644 --- a/source/adapters/cuda/program.cpp +++ b/source/adapters/cuda/program.cpp @@ -11,6 +11,17 @@ 
#include "program.hpp" #include "ur_util.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + bool getMaxRegistersJitOptionValue(const std::string &BuildOptions, unsigned int &Value) { using namespace std::string_view_literals; @@ -54,9 +65,10 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, auto [Prefix, Tag] = splitMetadataName(MetadataElementName); - if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { - // If metadata is reqd_work_group_size, record it for the corresponding - // kernel name. + if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE || + Tag == __SYCL_UR_PROGRAM_METADATA_TAG_MAX_WORK_GROUP_SIZE) { + // If metadata is reqd_work_group_size/max_work_group_size, record it for + // the corresponding kernel name. size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t); // Expect between 1 and 3 32-bit integer values. @@ -69,11 +81,13 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, reinterpret_cast(MetadataElement.value.pData) + sizeof(std::uint64_t); // Read values and pad with 1's for values not present. - std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1}; - std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize); - KernelReqdWorkGroupSizeMD[Prefix] = - std::make_tuple(ReqdWorkGroupElements[0], ReqdWorkGroupElements[1], - ReqdWorkGroupElements[2]); + std::array WorkGroupElements = {1, 1, 1}; + std::memcpy(WorkGroupElements.data(), ValuePtr, MDElemsSize); + (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE + ? 
KernelReqdWorkGroupSizeMD + : KernelMaxWorkGroupSizeMD)[Prefix] = + std::make_tuple(WorkGroupElements[0], WorkGroupElements[1], + WorkGroupElements[2]); } else if (Tag == __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING) { const char *MetadataValPtr = reinterpret_cast(MetadataElement.value.pData) + @@ -81,6 +95,9 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, const char *MetadataValPtrEnd = MetadataValPtr + MetadataElement.size - sizeof(std::uint64_t); GlobalIDMD[Prefix] = std::string{MetadataValPtr, MetadataValPtrEnd}; + } else if (Tag == + __SYCL_UR_PROGRAM_METADATA_TAG_MAX_LINEAR_WORK_GROUP_SIZE) { + KernelMaxLinearWorkGroupSizeMD[Prefix] = MetadataElement.value.data64; } } return UR_RESULT_SUCCESS; @@ -399,8 +416,6 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, return ReturnValue(1u); case UR_PROGRAM_INFO_DEVICES: return ReturnValue(&hProgram->Device, 1); - case UR_PROGRAM_INFO_SOURCE: - return ReturnValue(hProgram->Binary); case UR_PROGRAM_INFO_BINARY_SIZES: return ReturnValue(&hProgram->BinarySizeInBytes, 1); case UR_PROGRAM_INFO_BINARIES: @@ -410,6 +425,7 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, UR_ASSERT(getKernelNames(hProgram), UR_RESULT_ERROR_UNSUPPORTED_FEATURE); return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; case UR_PROGRAM_INFO_NUM_KERNELS: + case UR_PROGRAM_INFO_IL: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; diff --git a/source/adapters/cuda/program.hpp b/source/adapters/cuda/program.hpp index 5d41374d34..10998cae2c 100644 --- a/source/adapters/cuda/program.hpp +++ b/source/adapters/cuda/program.hpp @@ -36,6 +36,9 @@ struct ur_program_handle_t_ { std::unordered_map> KernelReqdWorkGroupSizeMD; std::unordered_map GlobalIDMD; + std::unordered_map> + KernelMaxWorkGroupSizeMD; + std::unordered_map KernelMaxLinearWorkGroupSizeMD; constexpr static size_t MaxLogSize = 8192u; @@ -45,7 +48,8 @@ struct ur_program_handle_t_ { 
ur_program_handle_t_(ur_context_handle_t Context, ur_device_handle_t Device) : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, - Context{Context}, Device{Device}, KernelReqdWorkGroupSizeMD{} { + Context{Context}, Device{Device}, KernelReqdWorkGroupSizeMD{}, + KernelMaxWorkGroupSizeMD{}, KernelMaxLinearWorkGroupSizeMD{} { urContextRetain(Context); urDeviceRetain(Device); } diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp index bd92a01400..548940f853 100644 --- a/source/adapters/cuda/queue.cpp +++ b/source/adapters/cuda/queue.cpp @@ -45,7 +45,8 @@ CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { // change NumComputeStreams after that if (NumComputeStreams < ComputeStreams.size()) { UR_CHECK_ERROR(cuStreamCreateWithPriority( - &ComputeStreams[NumComputeStreams++], Flags, Priority)); + &ComputeStreams[NumComputeStreams], Flags, Priority)); + ++NumComputeStreams; } } Token = ComputeStreamIndex++; @@ -110,7 +111,8 @@ CUstream ur_queue_handle_t_::getNextTransferStream() { // change NumTransferStreams after that if (NumTransferStreams < TransferStreams.size()) { UR_CHECK_ERROR(cuStreamCreateWithPriority( - &TransferStreams[NumTransferStreams++], Flags, Priority)); + &TransferStreams[NumTransferStreams], Flags, Priority)); + ++NumTransferStreams; } } uint32_t StreamI = TransferStreamIndex++ % TransferStreams.size(); @@ -263,7 +265,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( ur_native_handle_t hNativeQueue, ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_native_properties_t *pProperties, ur_queue_handle_t *phQueue) { - (void)hDevice; + if (!hDevice && hContext->getDevices().size() == 1) + hDevice = hContext->getDevices().front(); unsigned int CuFlags; CUstream CuStream = reinterpret_cast(hNativeQueue); diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp index 97d19640b2..a9559eb188 100644 --- 
a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -124,7 +124,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgSampler = urKernelSetArgSampler; pDdiTable->pfnSetArgValue = urKernelSetArgValue; pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; - pDdiTable->pfnSetSpecializationConstants = nullptr; + pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; return UR_RESULT_SUCCESS; } @@ -301,7 +301,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( pDdiTable->pfnCommandGetInfoExp = urCommandBufferCommandGetInfoExp; pDdiTable->pfnReleaseCommandExp = urCommandBufferReleaseCommandExp; pDdiTable->pfnRetainCommandExp = urCommandBufferRetainCommandExp; - + pDdiTable->pfnUpdateWaitEventsExp = urCommandBufferUpdateWaitEventsExp; + pDdiTable->pfnUpdateSignalEventExp = urCommandBufferUpdateSignalEventExp; return retVal; } @@ -340,7 +341,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnImportExternalMemoryExp = urBindlessImagesImportExternalMemoryExp; pDdiTable->pfnMapExternalArrayExp = urBindlessImagesMapExternalArrayExp; - pDdiTable->pfnReleaseInteropExp = urBindlessImagesReleaseInteropExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + urBindlessImagesMapExternalLinearMemoryExp; + pDdiTable->pfnReleaseExternalMemoryExp = + urBindlessImagesReleaseExternalMemoryExp; pDdiTable->pfnImportExternalSemaphoreExp = urBindlessImagesImportExternalSemaphoreExp; pDdiTable->pfnReleaseExternalSemaphoreExp = diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp index a7b3a60eac..8a6ac41b08 100644 --- a/source/adapters/cuda/usm.cpp +++ b/source/adapters/cuda/usm.cpp @@ -22,6 +22,13 @@ #include +namespace umf { +ur_result_t getProviderNativeError(const char *, int32_t) { + // TODO: implement when UMF supports CUDA + return 
UR_RESULT_ERROR_UNKNOWN; +} +} // namespace umf + /// USM: Implements USM Host allocations using CUDA Pinned Memory /// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#page-locked-host-memory UR_APIEXPORT ur_result_t UR_APICALL @@ -398,7 +405,7 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, HostMemPool = umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + umfDisjointPoolOps(), std::move(MemProvider), &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host]) .second; @@ -407,7 +414,7 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, umf::memoryProviderMakeUnique(Context, Device) .second; DeviceMemPool = umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + umfDisjointPoolOps(), std::move(MemProvider), &this->DisjointPoolConfigs .Configs[usm::DisjointPoolMemType::Device]) .second; @@ -415,7 +422,7 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, umf::memoryProviderMakeUnique(Context, Device) .second; SharedMemPool = umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + umfDisjointPoolOps(), std::move(MemProvider), &this->DisjointPoolConfigs .Configs[usm::DisjointPoolMemType::Shared]) .second; diff --git a/source/adapters/cuda/usm.hpp b/source/adapters/cuda/usm.hpp index e5d1f7fbaa..7c6a2ea666 100644 --- a/source/adapters/cuda/usm.hpp +++ b/source/adapters/cuda/usm.hpp @@ -81,6 +81,12 @@ class USMMemoryProvider { umf_result_t purge_force(void *, size_t) { return UMF_RESULT_ERROR_NOT_SUPPORTED; }; + umf_result_t allocation_merge(void *, void *, size_t) { + return UMF_RESULT_ERROR_UNKNOWN; + } + umf_result_t allocation_split(void *, size_t, size_t) { + return UMF_RESULT_ERROR_UNKNOWN; + } virtual const char *get_name() = 0; virtual ~USMMemoryProvider() = default; diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt index 39eb80a6c9..36222907c6 100644 
--- a/source/adapters/hip/CMakeLists.txt +++ b/source/adapters/hip/CMakeLists.txt @@ -8,8 +8,13 @@ set(TARGET_NAME ur_adapter_hip) # Set default UR HIP platform to AMD set(UR_HIP_PLATFORM "AMD" CACHE STRING "UR HIP platform, AMD or NVIDIA") +set(DEFAULT_ROCM_PATH "/opt/rocm") +if(DEFINED ENV{ROCM_PATH}) + set(DEFAULT_ROCM_PATH $ENV{ROCM_PATH}) +endif() + # Set default ROCm installation directory -set(UR_HIP_ROCM_DIR "/opt/rocm" CACHE STRING "ROCm installation dir") +set(UR_HIP_ROCM_DIR "${DEFAULT_ROCM_PATH}" CACHE STRING "ROCm installation dir") # Allow custom location of HIP/HSA include and HIP library directories set(UR_HIP_INCLUDE_DIR "${UR_HIP_ROCM_DIR}/include" CACHE PATH "Custom ROCm HIP include dir") @@ -88,6 +93,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.hpp ) +install_ur_library(${TARGET_NAME}) if(NOT MSVC) target_compile_options(${TARGET_NAME} PRIVATE @@ -141,6 +147,7 @@ if("${UR_HIP_PLATFORM}" STREQUAL "AMD") target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common + ${PROJECT_NAME}::umf rocmdrv ) @@ -175,6 +182,7 @@ elseif("${UR_HIP_PLATFORM}" STREQUAL "NVIDIA") target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common + ${PROJECT_NAME}::umf Threads::Threads cudadrv cudart diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index d9438eeb9c..9ecb1a5477 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -76,12 +76,12 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { ur_exp_command_buffer_command_handle_t_:: ur_exp_command_buffer_command_handle_t_( ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, - std::shared_ptr &&Node, hipKernelNodeParams Params, - uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, - const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr) - : 
CommandBuffer(CommandBuffer), Kernel(Kernel), Node(std::move(Node)), - Params(Params), WorkDim(WorkDim), RefCountInternal(1), - RefCountExternal(1) { + hipGraphNode_t Node, hipKernelNodeParams Params, uint32_t WorkDim, + const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, + const size_t *LocalWorkSizePtr, uint32_t NumKernelAlternatives, + ur_kernel_handle_t *KernelAlternatives) + : CommandBuffer(CommandBuffer), Kernel(Kernel), Node(Node), Params(Params), + WorkDim(WorkDim), RefCountInternal(1), RefCountExternal(1) { CommandBuffer->incrementInternalReferenceCount(); const size_t CopySize = sizeof(size_t) * WorkDim; @@ -99,6 +99,13 @@ ur_exp_command_buffer_command_handle_t_:: std::memset(GlobalWorkOffset + WorkDim, 0, ZeroSize); std::memset(GlobalWorkSize + WorkDim, 0, ZeroSize); } + + /* Add the default Kernel as a valid kernel handle for this command */ + ValidKernelHandles.insert(Kernel); + if (KernelAlternatives) { + ValidKernelHandles.insert(KernelAlternatives, + KernelAlternatives + NumKernelAlternatives); + } } /// Helper function for finding the HIP Nodes associated with the commands in a @@ -125,7 +132,7 @@ static ur_result_t getNodesFromSyncPoints( for (size_t i = 0; i < NumSyncPointsInWaitList; i++) { if (auto NodeHandle = SyncPoints.find(SyncPointWaitList[i]); NodeHandle != SyncPoints.end()) { - HIPNodesList.push_back(*NodeHandle->second.get()); + HIPNodesList.push_back(NodeHandle->second); } else { return UR_RESULT_ERROR_INVALID_VALUE; } @@ -139,29 +146,23 @@ static ur_result_t enqueueCommandBufferFillHelper( const hipMemoryType DstType, const void *Pattern, size_t PatternSize, size_t Size, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + ur_exp_command_buffer_sync_point_t *RetSyncPoint) { std::vector DepsList; - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - 
SyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, DepsList)); try { + // Graph node added to graph, if multiple nodes are created this will + // be set to the leaf node + hipGraphNode_t GraphNode; + const size_t N = Size / PatternSize; auto DstPtr = DstType == hipMemoryTypeDevice ? *static_cast(DstDevice) : DstDevice; if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) { - // Create a new node - hipGraphNode_t GraphNode; hipMemsetParams NodeParams = {}; NodeParams.dst = DstPtr; NodeParams.elementSize = PatternSize; @@ -192,10 +193,6 @@ static ur_result_t enqueueCommandBufferFillHelper( DepsList.data(), DepsList.size(), &NodeParams)); - // Get sync point and register the node with it. - *SyncPoint = CommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); - } else { // HIP has no memset functions that allow setting values more than 4 // bytes. UR API lets you pass an arbitrary "pattern" to the buffer @@ -206,11 +203,6 @@ static ur_result_t enqueueCommandBufferFillHelper( size_t NumberOfSteps = PatternSize / sizeof(uint8_t); - // Shared pointer that will point to the last node created - std::shared_ptr GraphNodePtr; - - // Create a new node - hipGraphNode_t GraphNodeFirst; // Update NodeParam hipMemsetParams NodeParamsStepFirst = {}; NodeParamsStepFirst.dst = DstPtr; @@ -220,16 +212,12 @@ static ur_result_t enqueueCommandBufferFillHelper( NodeParamsStepFirst.value = *(static_cast(Pattern)); NodeParamsStepFirst.width = 1; - UR_CHECK_ERROR(hipGraphAddMemsetNode( - &GraphNodeFirst, CommandBuffer->HIPGraph, DepsList.data(), - DepsList.size(), &NodeParamsStepFirst)); - - // Get sync point and register the node with it. 
- *SyncPoint = CommandBuffer->addSyncPoint( - std::make_shared(GraphNodeFirst)); + UR_CHECK_ERROR(hipGraphAddMemsetNode(&GraphNode, CommandBuffer->HIPGraph, + DepsList.data(), DepsList.size(), + &NodeParamsStepFirst)); DepsList.clear(); - DepsList.push_back(GraphNodeFirst); + DepsList.push_back(GraphNode); // we walk up the pattern in 1-byte steps, and add Memset node for each // 1-byte chunk of the pattern. @@ -241,8 +229,6 @@ static ur_result_t enqueueCommandBufferFillHelper( auto OffsetPtr = reinterpret_cast( reinterpret_cast(DstPtr) + (Step * sizeof(uint8_t))); - // Create a new node - hipGraphNode_t GraphNode; // Update NodeParam hipMemsetParams NodeParamsStep = {}; NodeParamsStep.dst = reinterpret_cast(OffsetPtr); @@ -256,14 +242,17 @@ static ur_result_t enqueueCommandBufferFillHelper( &GraphNode, CommandBuffer->HIPGraph, DepsList.data(), DepsList.size(), &NodeParamsStep)); - GraphNodePtr = std::make_shared(GraphNode); - // Get sync point and register the node with it. - *SyncPoint = CommandBuffer->addSyncPoint(GraphNodePtr); - DepsList.clear(); - DepsList.push_back(*GraphNodePtr.get()); + DepsList.push_back(GraphNode); } } + + // Get sync point and register the node with it. 
+ auto SyncPoint = CommandBuffer->addSyncPoint(GraphNode); + if (RetSyncPoint) { + *RetSyncPoint = SyncPoint; + } + } catch (ur_result_t Err) { return Err; } @@ -331,61 +320,63 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numKernelAlternatives, ur_kernel_handle_t *phKernelAlternatives, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, ur_exp_command_buffer_command_handle_t *phCommand) { + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; // Preconditions + // Command handles can only be obtained from updatable command-buffers + UR_ASSERT(!(phCommand && !hCommandBuffer->IsUpdatable), + UR_RESULT_ERROR_INVALID_OPERATION); UR_ASSERT(hCommandBuffer->Context == hKernel->getContext(), UR_RESULT_ERROR_INVALID_KERNEL); UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - hipGraphNode_t GraphNode; - std::vector DepsList; + for (uint32_t i = 0; i < numKernelAlternatives; ++i) { + UR_ASSERT(phKernelAlternatives[i] != hKernel, + UR_RESULT_ERROR_INVALID_VALUE); + } - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); + try { + hipGraphNode_t GraphNode; + std::vector DepsList; - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(getNodesFromSyncPoints( 
+ hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList)); - if (*pGlobalWorkSize == 0) { - try { + if (*pGlobalWorkSize == 0) { // Create an empty node if the kernel workload size is zero UR_CHECK_ERROR(hipGraphAddEmptyNode(&GraphNode, hCommandBuffer->HIPGraph, DepsList.data(), DepsList.size())); // Get sync point and register the node with it. - *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); - } catch (ur_result_t Err) { - return Err; + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } + return UR_RESULT_SUCCESS; } - return UR_RESULT_SUCCESS; - } - // Set the number of threads per block to the number of threads per warp - // by default unless user has provided a better number - size_t ThreadsPerBlock[3] = {64u, 1u, 1u}; - size_t BlocksPerGrid[3] = {1u, 1u, 1u}; + // Set the number of threads per block to the number of threads per warp + // by default unless user has provided a better number + size_t ThreadsPerBlock[3] = {64u, 1u, 1u}; + size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - uint32_t LocalSize = hKernel->getLocalSize(); - hipFunction_t HIPFunc = hKernel->get(); - UR_CALL(setKernelParams(hCommandBuffer->Device, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, hKernel, HIPFunc, - ThreadsPerBlock, BlocksPerGrid), - Result); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + uint32_t LocalSize = hKernel->getLocalSize(); + hipFunction_t HIPFunc = hKernel->get(); + UR_CHECK_ERROR(setKernelParams( + hCommandBuffer->Device, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid)); - try { // Set node param structure with the kernel related data auto &ArgIndices = hKernel->getArgIndices(); hipKernelNodeParams NodeParams; @@ -409,14 +400,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( hKernel->clearLocalSize(); // Get sync point and register the node with 
it. - auto NodeSP = std::make_shared(GraphNode); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); if (pSyncPoint) { - *pSyncPoint = hCommandBuffer->addSyncPoint(NodeSP); + *pSyncPoint = SyncPoint; } auto NewCommand = new ur_exp_command_buffer_command_handle_t_{ - hCommandBuffer, hKernel, std::move(NodeSP), NodeParams, - workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize}; + hCommandBuffer, hKernel, GraphNode, + NodeParams, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, numKernelAlternatives, + phKernelAlternatives}; NewCommand->incrementInternalReferenceCount(); hCommandBuffer->CommandHandles.push_back(NewCommand); @@ -435,32 +428,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc, size_t size, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + std::ignore = phCommand; hipGraphNode_t GraphNode; std::vector DepsList; UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { - UR_CHECK_ERROR(hipGraphAddMemcpyNode1D( - &GraphNode, hCommandBuffer->HIPGraph, DepsList.data(), DepsList.size(), - pDst, pSrc, size, hipMemcpyHostToHost)); + 
UR_CHECK_ERROR(hipGraphAddMemcpyNode1D(&GraphNode, hCommandBuffer->HIPGraph, + DepsList.data(), DepsList.size(), + pDst, pSrc, size, hipMemcpyDefault)); // Get sync point and register the node with it. - *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -472,7 +465,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( ur_mem_handle_t hDstMem, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + std::ignore = phCommand; hipGraphNode_t GraphNode; std::vector DepsList; @@ -483,16 +482,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( UR_ASSERT(size + srcOffset <= std::get(hSrcMem->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto Src = std::get(hSrcMem->Mem) @@ -505,8 +496,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( Dst, Src, size, hipMemcpyDeviceToDevice)); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -520,23 +513,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + std::ignore = phCommand; hipGraphNode_t GraphNode; std::vector DepsList; UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto SrcPtr = @@ -554,8 +545,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( &NodeParams)); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -568,23 +561,21 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( size_t offset, size_t size, const void *pSrc, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + std::ignore = phCommand; hipGraphNode_t GraphNode; std::vector DepsList; UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto Dst = std::get(hBuffer->Mem) @@ -595,8 +586,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( Dst, pSrc, size, hipMemcpyHostToDevice)); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -608,23 +601,21 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, size_t offset, size_t size, void *pDst, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + std::ignore = phCommand; hipGraphNode_t GraphNode; std::vector DepsList; UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto Src = std::get(hBuffer->Mem) @@ -635,8 +626,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( pDst, Src, size, hipMemcpyDeviceToHost)); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -651,23 +644,21 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + std::ignore = phCommand; hipGraphNode_t GraphNode; std::vector DepsList; UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto DstPtr = @@ -683,8 +674,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( &NodeParams)); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -699,23 +692,21 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( size_t hostRowPitch, size_t hostSlicePitch, void *pDst, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + std::ignore = phCommand; hipGraphNode_t GraphNode; std::vector DepsList; UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { auto SrcPtr = @@ -731,8 +722,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( &NodeParams)); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -744,7 +737,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( size_t /*Size*/, ur_usm_migration_flags_t /*Flags*/, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + std::ignore = phCommand; // Prefetch cmd is not supported by Hip Graph. // We implement it as an empty node to enforce dependencies. hipGraphNode_t GraphNode; @@ -753,16 +752,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { // Create an empty node if the kernel workload size is zero @@ -770,13 +761,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( DepsList.data(), DepsList.size())); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); - - setErrorMessage("Prefetch hint ignored and replaced with empty node as " - "prefetch is not supported by HIP Graph backend", - UR_RESULT_SUCCESS); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -788,7 +776,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( size_t /*Size*/, ur_usm_advice_flags_t /*Advice*/, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + std::ignore = phCommand; // Mem-Advise cmd is not supported by Hip Graph. // We implement it as an empty node to enforce dependencies. hipGraphNode_t GraphNode; @@ -797,16 +791,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, - pSyncPointWaitList, DepsList), - Result); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } + UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); try { // Create an empty node if the kernel workload size is zero @@ -814,13 +800,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( DepsList.data(), DepsList.size())); // Get sync point and register the node with it. 
- *pSyncPoint = hCommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); - - setErrorMessage("Memory advice ignored and replaced with empty node as " - "memory advice is not supported by HIP Graph backend", - UR_RESULT_SUCCESS); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + auto SyncPoint = hCommandBuffer->addSyncPoint(GraphNode); + if (pSyncPoint) { + *pSyncPoint = SyncPoint; + } } catch (ur_result_t Err) { return Err; } @@ -832,7 +815,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( const void *pPattern, size_t patternSize, size_t offset, size_t size, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + std::ignore = phCommand; auto ArgsAreMultiplesOfPatternSize = (offset % patternSize == 0) || (size % patternSize == 0); @@ -859,7 +848,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( const void *pPattern, size_t patternSize, size_t size, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + std::ignore = phCommand; auto PatternIsValid = (pPattern != nullptr); @@ -878,20 +873,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, 
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; - try { std::unique_ptr RetImplEvent{nullptr}; - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); uint32_t StreamToken; ur_stream_guard Guard; hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - if ((Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, - phEventWaitList)) != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr( @@ -908,10 +899,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( *phEvent = RetImplEvent.release(); } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainCommandExp( @@ -927,149 +918,173 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( return commandHandleReleaseInternal(hCommand); } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( - ur_exp_command_buffer_command_handle_t hCommand, - const ur_exp_command_buffer_update_kernel_launch_desc_t - *pUpdateKernelLaunch) { - // Update requires command-buffer to be finalized - ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer; - if (!CommandBuffer->HIPGraphExec) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } +/** + * Validates contents of the update command description. + * @param[in] Command The command which is being updated. + * @param[in] UpdateCommandDesc The update command description. 
+ * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +validateCommandDesc(ur_exp_command_buffer_command_handle_t Command, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *UpdateCommandDesc) { + + auto CommandBuffer = Command->CommandBuffer; - // Update requires command-buffer to be created with update enabled - if (!CommandBuffer->IsUpdatable) { + // Update requires the command-buffer to be finalized and updatable. + if (!CommandBuffer->HIPGraphExec || !CommandBuffer->IsUpdatable) { return UR_RESULT_ERROR_INVALID_OPERATION; } - if (auto NewWorkDim = pUpdateKernelLaunch->newWorkDim) { - // Error if work dim changes - if (NewWorkDim != hCommand->WorkDim) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } + if (UpdateCommandDesc->newWorkDim != Command->WorkDim && + (!UpdateCommandDesc->pNewGlobalWorkOffset || + !UpdateCommandDesc->pNewGlobalWorkSize)) { + return UR_RESULT_ERROR_INVALID_VALUE; + } - // Error If Local size and not global size - if ((pUpdateKernelLaunch->pNewLocalWorkSize != nullptr) && - (pUpdateKernelLaunch->pNewGlobalWorkSize == nullptr)) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } + if (UpdateCommandDesc->hNewKernel && + !Command->ValidKernelHandles.count(UpdateCommandDesc->hNewKernel)) { + return UR_RESULT_ERROR_INVALID_VALUE; + } - // Error if local size non-nullptr and created with null - // or if local size nullptr and created with non-null - const bool IsNewLocalSizeNull = - pUpdateKernelLaunch->pNewLocalWorkSize == nullptr; - const bool IsOriginalLocalSizeNull = hCommand->isNullLocalSize(); + return UR_RESULT_SUCCESS; +} - if (IsNewLocalSizeNull ^ IsOriginalLocalSizeNull) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } - } +/** + * Updates the arguments of CommandDesc->hNewKernel + * @param[in] Device The device associated with the kernel being updated. + * @param[in] UpdateCommandDesc The update command description that contains + * the new kernel and its arguments. 
+ * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +updateKernelArguments(ur_device_handle_t Device, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *UpdateCommandDesc) { - // Kernel corresponding to the command to update - ur_kernel_handle_t Kernel = hCommand->Kernel; - ur_device_handle_t Device = CommandBuffer->Device; + ur_kernel_handle_t NewKernel = UpdateCommandDesc->hNewKernel; // Update pointer arguments to the kernel - uint32_t NumPointerArgs = pUpdateKernelLaunch->numNewPointerArgs; + uint32_t NumPointerArgs = UpdateCommandDesc->numNewPointerArgs; const ur_exp_command_buffer_update_pointer_arg_desc_t *ArgPointerList = - pUpdateKernelLaunch->pNewPointerArgList; + UpdateCommandDesc->pNewPointerArgList; for (uint32_t i = 0; i < NumPointerArgs; i++) { const auto &PointerArgDesc = ArgPointerList[i]; uint32_t ArgIndex = PointerArgDesc.argIndex; const void *ArgValue = PointerArgDesc.pNewPointerArg; - ur_result_t Result = UR_RESULT_SUCCESS; try { - Kernel->setKernelArg(ArgIndex, sizeof(ArgValue), ArgValue); + NewKernel->setKernelArg(ArgIndex, sizeof(ArgValue), ArgValue); } catch (ur_result_t Err) { - Result = Err; - return Result; + return Err; } } // Update memobj arguments to the kernel - uint32_t NumMemobjArgs = pUpdateKernelLaunch->numNewMemObjArgs; + uint32_t NumMemobjArgs = UpdateCommandDesc->numNewMemObjArgs; const ur_exp_command_buffer_update_memobj_arg_desc_t *ArgMemobjList = - pUpdateKernelLaunch->pNewMemObjArgList; + UpdateCommandDesc->pNewMemObjArgList; for (uint32_t i = 0; i < NumMemobjArgs; i++) { const auto &MemobjArgDesc = ArgMemobjList[i]; uint32_t ArgIndex = MemobjArgDesc.argIndex; ur_mem_handle_t ArgValue = MemobjArgDesc.hNewMemObjArg; - ur_result_t Result = UR_RESULT_SUCCESS; try { if (ArgValue == nullptr) { - Kernel->setKernelArg(ArgIndex, 0, nullptr); + NewKernel->setKernelArg(ArgIndex, 0, nullptr); } else { void *HIPPtr = std::get(ArgValue->Mem).getVoid(Device); - Kernel->setKernelArg(ArgIndex, 
sizeof(void *), (void *)&HIPPtr); + NewKernel->setKernelArg(ArgIndex, sizeof(void *), (void *)&HIPPtr); } } catch (ur_result_t Err) { - Result = Err; - return Result; + return Err; } } // Update value arguments to the kernel - uint32_t NumValueArgs = pUpdateKernelLaunch->numNewValueArgs; + uint32_t NumValueArgs = UpdateCommandDesc->numNewValueArgs; const ur_exp_command_buffer_update_value_arg_desc_t *ArgValueList = - pUpdateKernelLaunch->pNewValueArgList; + UpdateCommandDesc->pNewValueArgList; for (uint32_t i = 0; i < NumValueArgs; i++) { const auto &ValueArgDesc = ArgValueList[i]; uint32_t ArgIndex = ValueArgDesc.argIndex; size_t ArgSize = ValueArgDesc.argSize; const void *ArgValue = ValueArgDesc.pNewValueArg; - ur_result_t Result = UR_RESULT_SUCCESS; - try { - Kernel->setKernelArg(ArgIndex, ArgSize, ArgValue); + NewKernel->setKernelArg(ArgIndex, ArgSize, ArgValue); } catch (ur_result_t Err) { - Result = Err; - return Result; + return Err; } } - // Set the updated ND range - const uint32_t NewWorkDim = pUpdateKernelLaunch->newWorkDim; - if (NewWorkDim != 0) { - UR_ASSERT(NewWorkDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - UR_ASSERT(NewWorkDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - hCommand->WorkDim = NewWorkDim; + return UR_RESULT_SUCCESS; +} + +/** + * Updates the command buffer command with new values from the update + * description. + * @param[in] Command The command to be updated. + * @param[in] UpdateCommandDesc The update command description. 
+ * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +updateCommand(ur_exp_command_buffer_command_handle_t Command, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *UpdateCommandDesc) { + + if (UpdateCommandDesc->hNewKernel) { + Command->Kernel = UpdateCommandDesc->hNewKernel; + } + + if (UpdateCommandDesc->hNewKernel) { + Command->WorkDim = UpdateCommandDesc->newWorkDim; } - if (pUpdateKernelLaunch->pNewGlobalWorkOffset) { - hCommand->setGlobalOffset(pUpdateKernelLaunch->pNewGlobalWorkOffset); + if (UpdateCommandDesc->pNewGlobalWorkOffset) { + Command->setGlobalOffset(UpdateCommandDesc->pNewGlobalWorkOffset); } - if (pUpdateKernelLaunch->pNewGlobalWorkSize) { - hCommand->setGlobalSize(pUpdateKernelLaunch->pNewGlobalWorkSize); + if (UpdateCommandDesc->pNewGlobalWorkSize) { + Command->setGlobalSize(UpdateCommandDesc->pNewGlobalWorkSize); + if (!UpdateCommandDesc->pNewLocalWorkSize) { + Command->setNullLocalSize(); + } } - if (pUpdateKernelLaunch->pNewLocalWorkSize) { - hCommand->setLocalSize(pUpdateKernelLaunch->pNewLocalWorkSize); + if (UpdateCommandDesc->pNewLocalWorkSize) { + Command->setLocalSize(UpdateCommandDesc->pNewLocalWorkSize); } - size_t *GlobalWorkOffset = hCommand->GlobalWorkOffset; - size_t *GlobalWorkSize = hCommand->GlobalWorkSize; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( + ur_exp_command_buffer_command_handle_t hCommand, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *pUpdateKernelLaunch) { + + ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer; + + UR_CHECK_ERROR(validateCommandDesc(hCommand, pUpdateKernelLaunch)); + UR_CHECK_ERROR( + updateKernelArguments(CommandBuffer->Device, pUpdateKernelLaunch)); + UR_CHECK_ERROR(updateCommand(hCommand, pUpdateKernelLaunch)); - // If no worksize is provided make sure we pass nullptr to setKernelParams so - // it can guess the local work size. 
+ // If no worksize is provided make sure we pass nullptr to setKernelParams + // so it can guess the local work size. const bool ProvidedLocalSize = !hCommand->isNullLocalSize(); size_t *LocalWorkSize = ProvidedLocalSize ? hCommand->LocalWorkSize : nullptr; - uint32_t WorkDim = hCommand->WorkDim; // Set the number of threads per block to the number of threads per warp // by default unless user has provided a better number size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - hipFunction_t HIPFunc = Kernel->get(); - auto Result = setKernelParams(Device, WorkDim, GlobalWorkOffset, - GlobalWorkSize, LocalWorkSize, Kernel, HIPFunc, - ThreadsPerBlock, BlocksPerGrid); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + hipFunction_t HIPFunc = hCommand->Kernel->get(); + UR_CHECK_ERROR(setKernelParams( + CommandBuffer->Device, hCommand->WorkDim, hCommand->GlobalWorkOffset, + hCommand->GlobalWorkSize, LocalWorkSize, hCommand->Kernel, HIPFunc, + ThreadsPerBlock, BlocksPerGrid)); hipKernelNodeParams &Params = hCommand->Params; @@ -1080,15 +1095,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.blockDim.x = ThreadsPerBlock[0]; Params.blockDim.y = ThreadsPerBlock[1]; Params.blockDim.z = ThreadsPerBlock[2]; - Params.sharedMemBytes = Kernel->getLocalSize(); - Params.kernelParams = const_cast(Kernel->getArgIndices().data()); + Params.sharedMemBytes = hCommand->Kernel->getLocalSize(); + Params.kernelParams = + const_cast(hCommand->Kernel->getArgIndices().data()); - hipGraphNode_t Node = *(hCommand->Node); + hipGraphNode_t Node = hCommand->Node; hipGraphExec_t HipGraphExec = CommandBuffer->HIPGraphExec; UR_CHECK_ERROR(hipGraphExecKernelNodeSetParams(HipGraphExec, Node, &Params)); return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateSignalEventExp( + ur_exp_command_buffer_command_handle_t hCommand, + ur_event_handle_t *phEvent) { + std::ignore = hCommand; + std::ignore = 
phEvent; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateWaitEventsExp( + ur_exp_command_buffer_command_handle_t hCommand, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList) { + std::ignore = hCommand; + std::ignore = NumEventsInWaitList; + std::ignore = phEventWaitList; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_exp_command_buffer_info_t propName, size_t propSize, void *pPropValue, @@ -1098,6 +1131,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp( switch (propName) { case UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT: return ReturnValue(hCommandBuffer->getExternalReferenceCount()); + case UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR: { + ur_exp_command_buffer_desc_t Descriptor{}; + Descriptor.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; + Descriptor.pNext = nullptr; + Descriptor.isUpdatable = hCommandBuffer->IsUpdatable; + Descriptor.isInOrder = false, Descriptor.enableProfiling = false; + + return ReturnValue(Descriptor); + } default: assert(!"Command-buffer info request not implemented"); } diff --git a/source/adapters/hip/command_buffer.hpp b/source/adapters/hip/command_buffer.hpp index 751fde3720..e162b8e640 100644 --- a/source/adapters/hip/command_buffer.hpp +++ b/source/adapters/hip/command_buffer.hpp @@ -15,6 +15,7 @@ #include "context.hpp" #include #include +#include // Trace an internal UR call #define UR_TRACE(Call) \ @@ -41,9 +42,10 @@ struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_command_handle_t_( ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, - std::shared_ptr &&Node, hipKernelNodeParams Params, - uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, - const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr); + hipGraphNode_t Node, hipKernelNodeParams Params, uint32_t WorkDim, 
+ const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, + const size_t *LocalWorkSizePtr, uint32_t NumKernelAlternatives, + ur_kernel_handle_t *KernelAlternatives); void setGlobalOffset(const size_t *GlobalWorkOffsetPtr) { const size_t CopySize = sizeof(size_t) * WorkDim; @@ -72,6 +74,10 @@ struct ur_exp_command_buffer_command_handle_t_ { } } + void setNullLocalSize() noexcept { + std::memset(LocalWorkSize, 0, sizeof(size_t) * 3); + } + bool isNullLocalSize() const noexcept { const size_t Zeros[3] = {0, 0, 0}; return 0 == std::memcmp(LocalWorkSize, Zeros, sizeof(LocalWorkSize)); @@ -95,8 +101,14 @@ struct ur_exp_command_buffer_command_handle_t_ { } ur_exp_command_buffer_handle_t CommandBuffer; + + // The currently active kernel handle for this command. ur_kernel_handle_t Kernel; - std::shared_ptr Node; + + // Set of all the kernel handles that can be used when updating this command. + std::unordered_set ValidKernelHandles; + + hipGraphNode_t Node; hipKernelNodeParams Params; uint32_t WorkDim; @@ -117,7 +129,7 @@ struct ur_exp_command_buffer_handle_t_ { ~ur_exp_command_buffer_handle_t_(); void registerSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint, - std::shared_ptr &&HIPNode) { + hipGraphNode_t HIPNode) { SyncPoints[SyncPoint] = std::move(HIPNode); NextSyncPoint++; } @@ -129,8 +141,7 @@ struct ur_exp_command_buffer_handle_t_ { // Helper to register next sync point // @param HIPNode Node to register as next sync point // @return Pointer to the sync that registers the Node - ur_exp_command_buffer_sync_point_t - addSyncPoint(std::shared_ptr HIPNode) { + ur_exp_command_buffer_sync_point_t addSyncPoint(hipGraphNode_t HIPNode) { ur_exp_command_buffer_sync_point_t SyncPoint = NextSyncPoint; registerSyncPoint(SyncPoint, std::move(HIPNode)); return SyncPoint; @@ -171,8 +182,7 @@ struct ur_exp_command_buffer_handle_t_ { std::atomic_uint32_t RefCountExternal; // Map of sync_points to ur_events - std::unordered_map> + std::unordered_map SyncPoints; // 
Next sync_point value (may need to consider ways to reuse values if 32-bits // is not enough) diff --git a/source/adapters/hip/context.cpp b/source/adapters/hip/context.cpp index b0733a236d..761eab954d 100644 --- a/source/adapters/hip/context.cpp +++ b/source/adapters/hip/context.cpp @@ -32,10 +32,7 @@ ur_context_handle_t_::getOwningURPool(umf_memory_pool_t *UMFPool) { return nullptr; } -/// Create a UR HIP context. -/// -/// By default creates a scoped context and keeps the last active HIP context -/// on top of the HIP context stack. +/// Create a UR context. /// UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( uint32_t DeviceCount, const ur_device_handle_t *phDevices, @@ -44,7 +41,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( std::unique_ptr ContextPtr{nullptr}; try { - // Create a scoped context. + // Create a context. ContextPtr = std::unique_ptr( new ur_context_handle_t_{phDevices, DeviceCount}); *phContext = ContextPtr.release(); @@ -111,13 +108,15 @@ urContextRetain(ur_context_handle_t hContext) { return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle( - ur_context_handle_t hContext, ur_native_handle_t *phNativeContext) { - // FIXME: this entry point has been deprecated in the SYCL RT and should be - // changed to unsupported once the deprecation period has elapsed - *phNativeContext = reinterpret_cast( - hContext->getDevices()[0]->getNativeContext()); - return UR_RESULT_SUCCESS; +// urContextGetNativeHandle should not be implemented in the HIP backend. +// hipCtx_t is not natively supported by amd devices, and more importantly does +// not map to ur_context_handle_t in any way. 
+UR_APIEXPORT ur_result_t UR_APICALL +urContextGetNativeHandle([[maybe_unused]] ur_context_handle_t hContext, + [[maybe_unused]] ur_native_handle_t *phNativeContext) { + std::ignore = hContext; + std::ignore = phNativeContext; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( diff --git a/source/adapters/hip/context.hpp b/source/adapters/hip/context.hpp index 90366436e2..5af95753b8 100644 --- a/source/adapters/hip/context.hpp +++ b/source/adapters/hip/context.hpp @@ -57,6 +57,15 @@ typedef void (*ur_context_extended_deleter_t)(void *UserData); /// See proposal for details. /// https://github.com/codeplaysoftware/standards-proposals/blob/master/extended-context-destruction/index.md /// +/// +/// Destructor callback +/// +/// Required to implement CP023, SYCL Extended Context Destruction, +/// the UR Context can store a number of callback functions that will be +/// called upon destruction of the UR Context. +/// See proposal for details. +/// https://github.com/codeplaysoftware/standards-proposals/blob/master/extended-context-destruction/index.md +/// /// Memory Management for Devices in a Context <\b> /// /// A \c ur_mem_handle_t is associated with a \c ur_context_handle_t_, which @@ -76,8 +85,6 @@ struct ur_context_handle_t_ { void operator()() { Function(UserData); } }; - using native_type = hipCtx_t; - std::vector Devices; std::atomic_uint32_t RefCount; @@ -89,11 +96,7 @@ struct ur_context_handle_t_ { } }; - ~ur_context_handle_t_() { - for (auto &Dev : Devices) { - urDeviceRelease(Dev); - } - } + ~ur_context_handle_t_() {} void invokeExtendedDeleters() { std::lock_guard Guard(Mutex); @@ -136,28 +139,3 @@ struct ur_context_handle_t_ { std::vector ExtendedDeleters; std::set PoolHandles; }; - -namespace { -/// Scoped context is used across all UR HIP plugin implementation to activate -/// the native Context on the current thread. 
The ScopedContext does not -/// reinstate the previous context as all operations in the hip adapter that -/// require an active context, set the active context and don't rely on context -/// reinstation -class ScopedContext { -public: - ScopedContext(ur_device_handle_t hDevice) { - hipCtx_t Original{}; - - if (!hDevice) { - throw UR_RESULT_ERROR_INVALID_DEVICE; - } - - hipCtx_t Desired = hDevice->getNativeContext(); - UR_CHECK_ERROR(hipCtxGetCurrent(&Original)); - if (Original != Desired) { - // Sets the desired context as the active one for the thread - UR_CHECK_ERROR(hipCtxSetCurrent(Desired)); - } - } -}; -} // namespace diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp index da92fa6a87..5d6ca49e97 100644 --- a/source/adapters/hip/device.cpp +++ b/source/adapters/hip/device.cpp @@ -223,7 +223,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(uint64_t{MaxAlloc}); } case UR_DEVICE_INFO_IMAGE_SUPPORTED: { - return ReturnValue(true); + return ReturnValue(ur_bool_t{true}); } case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: { // This call doesn't match to HIP as it doesn't have images, but instead @@ -458,19 +458,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(size_t(1000)); } case UR_DEVICE_INFO_ENDIAN_LITTLE: { - return ReturnValue(true); + return ReturnValue(ur_bool_t{true}); } case UR_DEVICE_INFO_AVAILABLE: { - return ReturnValue(true); + return ReturnValue(ur_bool_t{true}); } case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: { - return ReturnValue(true); + return ReturnValue(ur_bool_t{true}); } case UR_DEVICE_INFO_COMPILER_AVAILABLE: { - return ReturnValue(true); + return ReturnValue(ur_bool_t{true}); } case UR_DEVICE_INFO_LINKER_AVAILABLE: { - return ReturnValue(true); + return ReturnValue(ur_bool_t{true}); } case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: { auto Capability = ur_device_exec_capability_flags_t{ @@ -548,7 +548,7 @@ UR_APIEXPORT 
ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // postprocessing is NOP. HIP 4.3 docs indicate support for // native asserts are in progress std::string SupportedExtensions = ""; - SupportedExtensions += "pi_ext_intel_devicelib_assert "; + SupportedExtensions += "cl_intel_devicelib_assert "; SupportedExtensions += "ur_exp_usm_p2p "; int RuntimeVersion = 0; @@ -583,7 +583,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(size_t(1024)); } case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: { - return ReturnValue(true); + return ReturnValue(ur_bool_t{true}); } case UR_DEVICE_INFO_PARENT_DEVICE: { return ReturnValue(nullptr); @@ -857,9 +857,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(static_cast(MaxRegisters)); } case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - return ReturnValue(false); + return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_IMAGE_SRGB: - return ReturnValue(false); + return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_PCI_ADDRESS: { constexpr size_t AddressBufferSize = 13; char AddressBuffer[AddressBufferSize]; @@ -875,20 +875,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, strnlen(AddressBuffer, AddressBufferSize - 1) + 1); } case UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED: - return ReturnValue(false); + return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT: - return ReturnValue(false); + return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_ESIMD_SUPPORT: - return ReturnValue(false); + return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: - return ReturnValue(true); + return ReturnValue(ur_bool_t{true}); case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: { // HIP supports enqueueing native work through the urNativeEnqueueExp - return ReturnValue(true); + return ReturnValue(ur_bool_t{true}); } case 
UR_DEVICE_INFO_GLOBAL_VARIABLE_SUPPORT: - return ReturnValue(false); + return ReturnValue(ur_bool_t{false}); // TODO: Investigate if this information is available on HIP. case UR_DEVICE_INFO_COMPONENT_DEVICES: case UR_DEVICE_INFO_COMPOSITE_DEVICE: @@ -903,9 +903,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_IL_VERSION: case UR_DEVICE_INFO_ASYNC_BARRIER: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; - - case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: - case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: { + case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: { int DriverVersion = 0; UR_CHECK_ERROR(hipDriverGetVersion(&DriverVersion)); @@ -917,6 +915,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, const int CmdBufDriverMinVersion = 50530202; // ROCM 5.5.1 return ReturnValue(DriverVersion >= CmdBufDriverMinVersion); } + case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP: { + int DriverVersion = 0; + UR_CHECK_ERROR(hipDriverGetVersion(&DriverVersion)); + const int CmdBufDriverMinVersion = 50530202; // ROCM 5.5.1 + if (DriverVersion < CmdBufDriverMinVersion) { + return ReturnValue( + static_cast(0)); + } + ur_device_command_buffer_update_capability_flags_t UpdateCapabilities = + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE; + return ReturnValue(UpdateCapabilities); + } + case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP: + return ReturnValue(false); default: break; } @@ -988,7 +1004,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t hNativeDevice, ur_platform_handle_t 
hPlatform, + ur_native_handle_t hNativeDevice, + [[maybe_unused]] ur_adapter_handle_t hAdapter, [[maybe_unused]] const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { // We can't cast between ur_native_handle_t and hipDevice_t, so memcpy the @@ -1000,16 +1017,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( return Dev->get() == HIPDevice; }; - // If a platform is provided just check if the device is in it - if (hPlatform) { - auto SearchRes = std::find_if(begin(hPlatform->Devices), - end(hPlatform->Devices), IsDevice); - if (SearchRes != end(hPlatform->Devices)) { - *phDevice = SearchRes->get(); - return UR_RESULT_SUCCESS; - } - } - // Get list of platforms uint32_t NumPlatforms = 0; ur_adapter_handle_t AdapterHandle = &adapter; @@ -1077,7 +1084,7 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, return UR_RESULT_SUCCESS; ur_event_handle_t_::native_type Event; - ScopedContext Active(hDevice); + ScopedDevice Active(hDevice); if (pDeviceTimestamp) { UR_CHECK_ERROR(hipEventCreateWithFlags(&Event, hipEventDefault)); diff --git a/source/adapters/hip/device.hpp b/source/adapters/hip/device.hpp index 5fd11bfc2f..bd2b6002e0 100644 --- a/source/adapters/hip/device.hpp +++ b/source/adapters/hip/device.hpp @@ -24,7 +24,6 @@ struct ur_device_handle_t_ { native_type HIPDevice; std::atomic_uint32_t RefCount; ur_platform_handle_t Platform; - hipCtx_t HIPContext; hipEvent_t EvBase; // HIP event used as base counter uint32_t DeviceIndex; @@ -37,11 +36,10 @@ struct ur_device_handle_t_ { int ConcurrentManagedAccess{0}; public: - ur_device_handle_t_(native_type HipDevice, hipCtx_t Context, - hipEvent_t EvBase, ur_platform_handle_t Platform, - uint32_t DeviceIndex) - : HIPDevice(HipDevice), RefCount{1}, Platform(Platform), - HIPContext(Context), EvBase(EvBase), DeviceIndex(DeviceIndex) { + ur_device_handle_t_(native_type HipDevice, hipEvent_t EvBase, + ur_platform_handle_t Platform, uint32_t 
DeviceIndex) + : HIPDevice(HipDevice), RefCount{1}, Platform(Platform), EvBase(EvBase), + DeviceIndex(DeviceIndex) { UR_CHECK_ERROR(hipDeviceGetAttribute( &MaxWorkGroupSize, hipDeviceAttributeMaxThreadsPerBlock, HIPDevice)); @@ -61,9 +59,7 @@ struct ur_device_handle_t_ { HIPDevice)); } - ~ur_device_handle_t_() noexcept(false) { - UR_CHECK_ERROR(hipDevicePrimaryCtxRelease(HIPDevice)); - } + ~ur_device_handle_t_() noexcept(false) {} native_type get() const noexcept { return HIPDevice; }; @@ -73,8 +69,6 @@ struct ur_device_handle_t_ { uint64_t getElapsedTime(hipEvent_t) const; - hipCtx_t getNativeContext() const noexcept { return HIPContext; }; - // Returns the index of the device relative to the other devices in the same // platform uint32_t getIndex() const noexcept { return DeviceIndex; }; @@ -97,3 +91,20 @@ struct ur_device_handle_t_ { }; int getAttribute(ur_device_handle_t Device, hipDeviceAttribute_t Attribute); + +namespace { +/// Scoped Device is used across all UR HIP plugin implementation to activate +/// the native Device on the current thread. 
The ScopedDevice does not +/// reinstate the previous device as all operations in the HIP adapter that +/// require an active device, set the active device and don't rely on device +/// reinstation +class ScopedDevice { +public: + ScopedDevice(ur_device_handle_t hDevice) { + if (!hDevice) { + throw UR_RESULT_ERROR_INVALID_DEVICE; + } + UR_CHECK_ERROR(hipSetDevice(hDevice->getIndex())); + } +}; +} // namespace diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 99f23a30a4..293f3eea7a 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -28,27 +28,23 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t Queue, hipStream_t Stream, return UR_RESULT_SUCCESS; } try { - auto Result = forLatestEvents( + UR_CHECK_ERROR(forLatestEvents( EventWaitList, NumEventsInWaitList, [Stream, Queue](ur_event_handle_t Event) -> ur_result_t { - ScopedContext Active(Queue->getDevice()); + ScopedDevice Active(Queue->getDevice()); if (Event->isCompleted() || Event->getStream() == Stream) { return UR_RESULT_SUCCESS; } else { UR_CHECK_ERROR(hipStreamWaitEvent(Stream, Event->get(), 0)); return UR_RESULT_SUCCESS; } - }); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - return UR_RESULT_SUCCESS; + })); } catch (ur_result_t Err) { return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } + return UR_RESULT_SUCCESS; } // Determine local work sizes that result in uniform work groups. 
@@ -164,7 +160,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( hBuffer->setLastQueueWritingToMemObj(hQueue); try { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); hipStream_t HIPStream = hQueue->getNextTransferStream(); UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList)); @@ -220,7 +216,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( } auto Device = hQueue->getDevice(); - ScopedContext Active(Device); + ScopedDevice Active(Device); hipStream_t HIPStream = hQueue->getNextTransferStream(); // Use the default stream if copying from another device @@ -290,7 +286,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( pGlobalWorkSize, pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid)); - ScopedContext Active(Dev); + ScopedDevice Active(Dev); uint32_t StreamToken; ur_stream_guard Guard; @@ -378,7 +374,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST) try { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); uint32_t StreamToken; ur_stream_guard Guard; hipStream_t HIPStream = hQueue->getNextComputeStream( @@ -533,7 +529,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( } auto Device = hQueue->getDevice(); - ScopedContext Active(Device); + ScopedDevice Active(Device); hipStream_t HIPStream = hQueue->getNextTransferStream(); UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, @@ -582,7 +578,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( hBuffer->setLastQueueWritingToMemObj(hQueue); try { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); hipStream_t HIPStream = hQueue->getNextTransferStream(); UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList)); @@ -629,13 +625,12 @@ UR_APIEXPORT ur_result_t 
UR_APICALL urEnqueueMemBufferCopy( std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext Active(hQueue->getDevice()); - ur_result_t Result = UR_RESULT_SUCCESS; + ScopedDevice Active(hQueue->getDevice()); auto Stream = hQueue->getNextTransferStream(); if (phEventWaitList) { - Result = enqueueEventsWait(hQueue, Stream, numEventsInWaitList, - phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, + phEventWaitList)); } if (phEvent) { @@ -657,12 +652,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( *phEvent = RetImplEvent.release(); } - return Result; } catch (ur_result_t Err) { return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( @@ -672,7 +667,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; void *SrcPtr = std::get(hBufferSrc->Mem).getVoid(hQueue->getDevice()); void *DstPtr = @@ -680,10 +674,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); hipStream_t HIPStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, - phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = @@ -692,10 +686,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( UR_CHECK_ERROR(RetImplEvent->start()); } - Result = commonEnqueueMemBufferCopyRect( + UR_CHECK_ERROR(commonEnqueueMemBufferCopyRect( HIPStream, region, &SrcPtr, hipMemoryTypeDevice, srcOrigin, srcRowPitch, srcSlicePitch, &DstPtr, 
hipMemoryTypeDevice, dstOrigin, dstRowPitch, - dstSlicePitch); + dstSlicePitch)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); @@ -703,9 +697,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize, @@ -794,7 +788,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( hBuffer->setLastQueueWritingToMemObj(hQueue); try { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); auto Stream = hQueue->getNextTransferStream(); if (phEventWaitList) { @@ -941,7 +935,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( } auto Device = hQueue->getDevice(); - ScopedContext Active(Device); + ScopedDevice Active(Device); hipStream_t HIPStream = hQueue->getNextTransferStream(); if (phEventWaitList) { @@ -1001,7 +995,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); try { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); hipStream_t HIPStream = hQueue->getNextTransferStream(); if (phEventWaitList) { @@ -1063,14 +1057,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( std::get(hImageDst->Mem).getImageType(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_result_t Result = UR_RESULT_SUCCESS; - try { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); hipStream_t HIPStream = hQueue->getNextTransferStream(); if (phEventWaitList) { - Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, - phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); } hipArray *SrcArray = @@ -1110,13 +1102,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( UR_CHECK_ERROR(RetImplEvent->start()); } - 
Result = commonEnqueueMemImageNDCopy( + UR_CHECK_ERROR(commonEnqueueMemImageNDCopy( HIPStream, ImgType, AdjustedRegion, SrcArray, hipMemoryTypeArray, - SrcOffset, DstArray, hipMemoryTypeArray, DstOffset); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + SrcOffset, DstArray, hipMemoryTypeArray, DstOffset)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); @@ -1161,7 +1149,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( hQueue, hBuffer, blockingMap, offset, size, MapPtr, numEventsInWaitList, phEventWaitList, phEvent)); } else { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); if (IsPinned) { UR_CHECK_ERROR(urEnqueueEventsWait(hQueue, numEventsInWaitList, @@ -1211,7 +1199,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( hQueue, hMem, true, Map->getMapOffset(), Map->getMapSize(), pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent)); } else { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); if (IsPinned) { UR_CHECK_ERROR(urEnqueueEventsWait(hQueue, numEventsInWaitList, @@ -1237,17 +1225,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( ur_queue_handle_t hQueue, void *ptr, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; std::unique_ptr EventPtr{nullptr}; try { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); uint32_t StreamToken; ur_stream_guard Guard; hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, - phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { EventPtr = std::unique_ptr(ur_event_handle_t_::makeNative( @@ -1274,8 +1261,9 @@ 
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( break; default: - Result = commonMemSetLargePattern(HIPStream, patternSize, size, pPattern, - reinterpret_cast(ptr)); + UR_CHECK_ERROR( + commonMemSetLargePattern(HIPStream, patternSize, size, pPattern, + reinterpret_cast(ptr))); break; } @@ -1284,25 +1272,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( *phEvent = EventPtr.release(); } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( ur_queue_handle_t hQueue, bool blocking, void *pDst, const void *pSrc, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; - std::unique_ptr EventPtr{nullptr}; try { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); hipStream_t HIPStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, - phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { EventPtr = std::unique_ptr(ur_event_handle_t_::makeNative( @@ -1321,9 +1307,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( *phEvent = EventPtr.release(); } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( @@ -1345,13 +1331,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE); #endif - ur_result_t Result = UR_RESULT_SUCCESS; - try { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); hipStream_t HIPStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, - phEventWaitList); + 
UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); std::unique_ptr EventPtr{nullptr}; @@ -1399,10 +1383,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( hipMemPrefetchAsync(pMem, size, hQueue->getDevice()->get(), HIPStream)); releaseEvent(); } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } /// USM: memadvise API to govern behavior of automatic migration mechanisms @@ -1425,7 +1409,7 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, #endif try { - ScopedContext Active(Device); + ScopedDevice Active(Device); std::unique_ptr EventPtr{nullptr}; if (phEvent) { @@ -1521,6 +1505,7 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, UR_RESULT_SUCCESS); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } + UR_CHECK_ERROR(Result); } releaseEvent(); @@ -1558,13 +1543,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( const void *pSrc, size_t srcPitch, size_t width, size_t height, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; - try { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); hipStream_t HIPStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, - phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); std::unique_ptr RetImplEvent{nullptr}; if (phEvent) { @@ -1668,10 +1651,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( UR_CHECK_ERROR(hipStreamSynchronize(HIPStream)); } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } namespace { @@ -1762,7 +1745,7 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim, size_t MaxWorkGroupSize = 0; ur_result_t Result = 
UR_RESULT_SUCCESS; try { - ScopedContext Active(Device); + ScopedDevice Active(Device); { size_t MaxThreadsPerBlock[3] = { static_cast(Device->getMaxBlockDimX()), @@ -1906,7 +1889,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( ur_result_t Result = UR_RESULT_SUCCESS; std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); uint32_t StreamToken; ur_stream_guard Guard; diff --git a/source/adapters/hip/enqueue_native.cpp b/source/adapters/hip/enqueue_native.cpp index 1ad6bbe2c0..ee171c1725 100644 --- a/source/adapters/hip/enqueue_native.cpp +++ b/source/adapters/hip/enqueue_native.cpp @@ -27,7 +27,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( // sure memory migration happens across devices in the same context try { - ScopedContext ActiveContext(hQueue->getDevice()); + ScopedDevice ActiveDevice(hQueue->getDevice()); ScopedStream ActiveStream(hQueue, NumEventsInWaitList, phEventWaitList); std::unique_ptr RetImplEvent{nullptr}; diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index dbf1d331ee..81c839cf32 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -155,7 +155,7 @@ urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { UR_ASSERT(numEvents > 0, UR_RESULT_ERROR_INVALID_VALUE); try { - ScopedContext Active(phEventWaitList[0]->getContext()->getDevices()[0]); + ScopedDevice Active(phEventWaitList[0]->getContext()->getDevices()[0]); auto WaitFunc = [](ur_event_handle_t Event) -> ur_result_t { UR_ASSERT(Event, UR_RESULT_ERROR_INVALID_EVENT); @@ -178,8 +178,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); switch (propName) { - case UR_EVENT_INFO_COMMAND_QUEUE: + case UR_EVENT_INFO_COMMAND_QUEUE: { + // If the runtime owns the native handle, we have reference to the 
queue. + // Otherwise, the event handle comes from an interop API with no RT refs. + if (!hEvent->getQueue()) { + setErrorMessage("Command queue info cannot be queried for the event. The " + "event object was created from a native event and has no " + "valid reference to a command queue.", + UR_RESULT_ERROR_INVALID_VALUE); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } return ReturnValue(hEvent->getQueue()); + } case UR_EVENT_INFO_COMMAND_TYPE: return ReturnValue(hEvent->getCommandType()); case UR_EVENT_INFO_REFERENCE_COUNT: diff --git a/source/adapters/hip/image.cpp b/source/adapters/hip/image.cpp index bf69b8e777..7449c3ba3f 100644 --- a/source/adapters/hip/image.cpp +++ b/source/adapters/hip/image.cpp @@ -117,8 +117,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, [[maybe_unused]] size_t size, [[maybe_unused]] ur_exp_external_mem_type_t memHandleType, - [[maybe_unused]] ur_exp_interop_mem_desc_t *pInteropMemDesc, - [[maybe_unused]] ur_exp_interop_mem_handle_t *phInteropMem) { + [[maybe_unused]] ur_exp_external_mem_desc_t *pExternalMemDesc, + [[maybe_unused]] ur_exp_external_mem_handle_t *phExternalMem) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -127,15 +127,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( [[maybe_unused]] ur_device_handle_t hDevice, [[maybe_unused]] const ur_image_format_t *pImageFormat, [[maybe_unused]] const ur_image_desc_t *pImageDesc, - [[maybe_unused]] ur_exp_interop_mem_handle_t hInteropMem, + [[maybe_unused]] ur_exp_external_mem_handle_t hExternalMem, [[maybe_unused]] ur_exp_image_mem_native_handle_t *phImageMem) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] 
ur_device_handle_t hDevice, - [[maybe_unused]] ur_exp_interop_mem_handle_t hInteropMem) { + [[maybe_unused]] uint64_t offset, [[maybe_unused]] uint64_t size, + [[maybe_unused]] ur_exp_external_mem_handle_t hExternalMem, + [[maybe_unused]] void **phRetMem) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( + [[maybe_unused]] ur_context_handle_t hContext, + [[maybe_unused]] ur_device_handle_t hDevice, + [[maybe_unused]] ur_exp_external_mem_handle_t hExternalMem) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -143,21 +152,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, [[maybe_unused]] ur_exp_external_semaphore_type_t semHandleType, - [[maybe_unused]] ur_exp_interop_semaphore_desc_t *pInteropSemaphoreDesc, - [[maybe_unused]] ur_exp_interop_semaphore_handle_t *phInteropSemaphore) { + [[maybe_unused]] ur_exp_external_semaphore_desc_t *pExternalSemaphoreDesc, + [[maybe_unused]] ur_exp_external_semaphore_handle_t *phExternalSemaphore) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, - [[maybe_unused]] ur_exp_interop_semaphore_handle_t hInteropSemaphore) { + [[maybe_unused]] ur_exp_external_semaphore_handle_t hExternalSemaphore) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( [[maybe_unused]] ur_queue_handle_t hQueue, - [[maybe_unused]] ur_exp_interop_semaphore_handle_t hSemaphore, + [[maybe_unused]] ur_exp_external_semaphore_handle_t hSemaphore, [[maybe_unused]] bool hasValue, [[maybe_unused]] uint64_t waitValue, [[maybe_unused]] uint32_t numEventsInWaitList, [[maybe_unused]] const ur_event_handle_t 
*phEventWaitList, @@ -167,7 +176,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( [[maybe_unused]] ur_queue_handle_t hQueue, - [[maybe_unused]] ur_exp_interop_semaphore_handle_t hSemaphore, + [[maybe_unused]] ur_exp_external_semaphore_handle_t hSemaphore, [[maybe_unused]] bool hasValue, [[maybe_unused]] uint64_t signalValue, [[maybe_unused]] uint32_t numEventsInWaitList, [[maybe_unused]] const ur_event_handle_t *phEventWaitList, diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp index aa46843963..60931cd014 100644 --- a/source/adapters/hip/kernel.cpp +++ b/source/adapters/hip/kernel.cpp @@ -20,7 +20,7 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, std::unique_ptr RetKernel{nullptr}; try { - ScopedContext Active(hProgram->getDevice()); + ScopedDevice Active(hProgram->getDevice()); hipFunction_t HIPFunc; hipError_t KernelError = @@ -127,6 +127,10 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, &Bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get())); return ReturnValue(uint64_t(Bytes)); } + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: + // FIXME: could be added + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } @@ -167,11 +171,11 @@ urKernelGetNativeHandle(ur_kernel_handle_t, ur_native_handle_t *) { UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, size_t localWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { - (void)hKernel; - (void)localWorkSize; - (void)dynamicSharedMemorySize; - *pGroupCountRet = 1; - return UR_RESULT_SUCCESS; + std::ignore = hKernel; + std::ignore = localWorkSize; + std::ignore = dynamicSharedMemorySize; + std::ignore = pGroupCountRet; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; 
} UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( @@ -373,7 +377,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( MaxThreadsPerBlock[2] = hQueue->Device->getMaxBlockDimZ(); ur_device_handle_t Device = hQueue->getDevice(); - ScopedContext Active(Device); + ScopedDevice Active(Device); guessLocalWorkSize(Device, ThreadsPerBlock, pGlobalWorkSize, workDim, MaxThreadsPerBlock); diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp index 5f06567064..aa7b5f4040 100644 --- a/source/adapters/hip/memory.cpp +++ b/source/adapters/hip/memory.cpp @@ -135,7 +135,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( if (PerformInitialCopy && HostPtr) { // Perform initial copy to every device in context for (auto &Device : hContext->getDevices()) { - ScopedContext Active(Device); + ScopedDevice Active(Device); // getPtr may allocate mem if not already allocated const auto &Ptr = std::get(URMemObj->Mem).getPtr(Device); UR_CHECK_ERROR(hipMemcpyHtoD(Ptr, HostPtr, size)); @@ -238,7 +238,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, // FIXME: Only getting info for the first device in the context. 
This // should be fine in general auto Device = hMemory->getContext()->getDevices()[0]; - ScopedContext Active(Device); + ScopedDevice Active(Device); UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); @@ -375,7 +375,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( if (PerformInitialCopy) { for (const auto &Dev : hContext->getDevices()) { - ScopedContext Active(Dev); + ScopedDevice Active(Dev); hipStream_t Stream{0}; // Use default stream UR_CHECK_ERROR( enqueueMigrateMemoryToDeviceIfNeeded(URMemObj.get(), Dev, Stream)); @@ -401,7 +401,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory, UR_ASSERT(hMemory->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); // FIXME: only getting infor for first image in ctx auto Device = hMemory->getContext()->getDevices()[0]; - ScopedContext Active(Device); + ScopedDevice Active(Device); UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); try { @@ -474,7 +474,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, const ur_device_handle_t hDevice) { - ScopedContext Active(hDevice); + ScopedDevice Active(hDevice); auto DeviceIdx = Mem->getContext()->getDeviceIndex(hDevice); ur_lock LockGuard(Mem->MemoryAllocationMutex); @@ -498,7 +498,7 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, UR_CHECK_ERROR(hipMalloc(&DevPtr, Buffer.Size)); } } else { - hipArray *ImageArray; + hipArray *ImageArray{}; hipSurfaceObject_t Surface; try { auto &Image = std::get(Mem->Mem); @@ -640,7 +640,7 @@ ur_result_t enqueueMigrateMemoryToDeviceIfNeeded( if (Mem->HaveMigratedToDeviceSinceLastWrite[DeviceIdx]) return UR_RESULT_SUCCESS; - ScopedContext Active(hDevice); + ScopedDevice Active(hDevice); if (Mem->isBuffer()) { UR_CHECK_ERROR(enqueueMigrateBufferToDevice(Mem, hDevice, Stream)); } else { diff --git a/source/adapters/hip/memory.hpp b/source/adapters/hip/memory.hpp index 
3ec1e8f4e9..b97f9d6b00 100644 --- a/source/adapters/hip/memory.hpp +++ b/source/adapters/hip/memory.hpp @@ -162,7 +162,7 @@ struct BufferMem { UR_CHECK_ERROR(hipHostUnregister(HostPtr)); break; case AllocMode::AllocHostPtr: - UR_CHECK_ERROR(hipFreeHost(HostPtr)); + UR_CHECK_ERROR(hipHostFree(HostPtr)); } return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/platform.cpp b/source/adapters/hip/platform.cpp index 8671d70a57..ebfd422a3b 100644 --- a/source/adapters/hip/platform.cpp +++ b/source/adapters/hip/platform.cpp @@ -77,17 +77,15 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, for (auto i = 0u; i < static_cast(NumDevices); ++i) { hipDevice_t Device; UR_CHECK_ERROR(hipDeviceGet(&Device, i)); - hipCtx_t Context; - UR_CHECK_ERROR(hipDevicePrimaryCtxRetain(&Context, Device)); hipEvent_t EvBase; UR_CHECK_ERROR(hipEventCreate(&EvBase)); // Use the default stream to record base event counter UR_CHECK_ERROR(hipEventRecord(EvBase, 0)); - Platform.Devices.emplace_back(new ur_device_handle_t_{ - Device, Context, EvBase, &Platform, i}); + Platform.Devices.emplace_back( + new ur_device_handle_t_{Device, EvBase, &Platform, i}); - ScopedContext Active(Platform.Devices.front().get()); + ScopedDevice Active(Platform.Devices.front().get()); } } catch (const std::bad_alloc &) { // Signal out-of-memory situation diff --git a/source/adapters/hip/program.cpp b/source/adapters/hip/program.cpp index 902e78aa9d..b1d7d28c47 100644 --- a/source/adapters/hip/program.cpp +++ b/source/adapters/hip/program.cpp @@ -313,7 +313,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t, ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext Active(hProgram->getDevice()); + ScopedDevice Active(hProgram->getDevice()); hProgram->buildProgram(pOptions); hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE; @@ -403,14 +403,14 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, return ReturnValue(1u); case 
UR_PROGRAM_INFO_DEVICES: return ReturnValue(&hProgram->getContext()->getDevices()[0], 1); - case UR_PROGRAM_INFO_SOURCE: - return ReturnValue(hProgram->Binary); case UR_PROGRAM_INFO_BINARY_SIZES: return ReturnValue(&hProgram->BinarySizeInBytes, 1); case UR_PROGRAM_INFO_BINARIES: return ReturnValue(&hProgram->Binary, 1); case UR_PROGRAM_INFO_KERNEL_NAMES: return getKernelNames(hProgram); + case UR_PROGRAM_INFO_IL: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } @@ -442,7 +442,7 @@ urProgramRelease(ur_program_handle_t hProgram) { ur_result_t Result = UR_RESULT_ERROR_INVALID_PROGRAM; try { - ScopedContext Active(hProgram->getDevice()); + ScopedDevice Active(hProgram->getDevice()); auto HIPModule = hProgram->get(); if (HIPModule) { UR_CHECK_ERROR(hipModuleUnload(HIPModule)); diff --git a/source/adapters/hip/queue.cpp b/source/adapters/hip/queue.cpp index c41bc53a08..5ab28d45ba 100644 --- a/source/adapters/hip/queue.cpp +++ b/source/adapters/hip/queue.cpp @@ -135,10 +135,10 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, } if (URFlags & UR_QUEUE_FLAG_PRIORITY_HIGH) { - ScopedContext Active(hDevice); + ScopedDevice Active(hDevice); UR_CHECK_ERROR(hipDeviceGetStreamPriorityRange(nullptr, &Priority)); } else if (URFlags & UR_QUEUE_FLAG_PRIORITY_LOW) { - ScopedContext Active(hDevice); + ScopedDevice Active(hDevice); UR_CHECK_ERROR(hipDeviceGetStreamPriorityRange(&Priority, nullptr)); } } @@ -225,7 +225,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { if (!hQueue->backendHasOwnership()) return UR_RESULT_SUCCESS; - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); hQueue->forEachStream([](hipStream_t S) { UR_CHECK_ERROR(hipStreamSynchronize(S)); @@ -251,7 +251,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { try { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); 
hQueue->syncStreams([&Result](hipStream_t S) { UR_CHECK_ERROR(hipStreamSynchronize(S)); @@ -283,7 +283,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t) { UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *, ur_native_handle_t *phNativeQueue) { - ScopedContext Active(hQueue->getDevice()); + ScopedDevice Active(hQueue->getDevice()); *phNativeQueue = reinterpret_cast(hQueue->getNextComputeStream()); return UR_RESULT_SUCCESS; @@ -299,7 +299,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( ur_native_handle_t hNativeQueue, ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_native_properties_t *pProperties, ur_queue_handle_t *phQueue) { - (void)hDevice; + if (!hDevice && hContext->getDevices().size() == 1) + hDevice = hContext->getDevices().front(); unsigned int HIPFlags; hipStream_t HIPStream = reinterpret_cast(hNativeQueue); diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp index af9b8fa9c3..1454ddfdf1 100644 --- a/source/adapters/hip/ur_interface_loader.cpp +++ b/source/adapters/hip/ur_interface_loader.cpp @@ -298,6 +298,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( pDdiTable->pfnCommandGetInfoExp = urCommandBufferCommandGetInfoExp; pDdiTable->pfnReleaseCommandExp = urCommandBufferReleaseCommandExp; pDdiTable->pfnRetainCommandExp = urCommandBufferRetainCommandExp; + pDdiTable->pfnUpdateWaitEventsExp = urCommandBufferUpdateWaitEventsExp; + pDdiTable->pfnUpdateSignalEventExp = urCommandBufferUpdateSignalEventExp; return retVal; } diff --git a/source/adapters/hip/usm.cpp b/source/adapters/hip/usm.cpp index 79337ba87a..5e28f3592d 100644 --- a/source/adapters/hip/usm.cpp +++ b/source/adapters/hip/usm.cpp @@ -18,6 +18,13 @@ #include "ur_util.hpp" #include "usm.hpp" +namespace umf { +ur_result_t getProviderNativeError(const char *, int32_t) { + // TODO: implement when 
UMF supports HIP + return UR_RESULT_ERROR_UNKNOWN; +} +} // namespace umf + /// USM: Implements USM Host allocations using HIP Pinned Memory UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, @@ -108,7 +115,7 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, ur_usm_device_mem_flags_t, size_t Size, [[maybe_unused]] uint32_t Alignment) { try { - ScopedContext Active(Device); + ScopedDevice Active(Device); UR_CHECK_ERROR(hipMalloc(ResultPtr, Size)); } catch (ur_result_t Err) { return Err; @@ -124,7 +131,7 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t, ur_usm_device_mem_flags_t, size_t Size, [[maybe_unused]] uint32_t Alignment) { try { - ScopedContext Active(Device); + ScopedDevice Active(Device); UR_CHECK_ERROR(hipMallocManaged(ResultPtr, Size, hipMemAttachGlobal)); } catch (ur_result_t Err) { return Err; @@ -218,7 +225,7 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, void *Base = nullptr; UR_CHECK_ERROR(hipPointerGetAttribute( &Base, HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR, - (hipDeviceptr_t)pMem)); + reinterpret_cast(const_cast(pMem)))); return ReturnValue(Base); } } @@ -340,7 +347,7 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, HostMemPool = umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + umfDisjointPoolOps(), std::move(MemProvider), &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host]) .second; @@ -349,7 +356,7 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, umf::memoryProviderMakeUnique(Context, Device) .second; DeviceMemPool = umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + umfDisjointPoolOps(), std::move(MemProvider), &this->DisjointPoolConfigs .Configs[usm::DisjointPoolMemType::Device]) .second; @@ -358,7 +365,7 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, 
umf::memoryProviderMakeUnique(Context, Device) .second; SharedMemPool = umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + umfDisjointPoolOps(), std::move(MemProvider), &this->DisjointPoolConfigs .Configs[usm::DisjointPoolMemType::Shared]) .second; diff --git a/source/adapters/hip/usm.hpp b/source/adapters/hip/usm.hpp index a1c3964263..2149ac26ba 100644 --- a/source/adapters/hip/usm.hpp +++ b/source/adapters/hip/usm.hpp @@ -81,6 +81,12 @@ class USMMemoryProvider { umf_result_t purge_force(void *, size_t) { return UMF_RESULT_ERROR_NOT_SUPPORTED; }; + umf_result_t allocation_merge(void *, void *, size_t) { + return UMF_RESULT_ERROR_UNKNOWN; + } + umf_result_t allocation_split(void *, size_t, size_t) { + return UMF_RESULT_ERROR_UNKNOWN; + } virtual const char *get_name() = 0; virtual ~USMMemoryProvider() = default; diff --git a/source/adapters/hip/usm_p2p.cpp b/source/adapters/hip/usm_p2p.cpp index d0d25c2092..5a3effd3c8 100644 --- a/source/adapters/hip/usm_p2p.cpp +++ b/source/adapters/hip/usm_p2p.cpp @@ -14,7 +14,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp( ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { try { - ScopedContext active(commandDevice); + ScopedDevice active(commandDevice); UR_CHECK_ERROR(hipDeviceEnablePeerAccess(peerDevice->get(), 0)); } catch (ur_result_t err) { return err; @@ -25,7 +25,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp( UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PDisablePeerAccessExp( ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { try { - ScopedContext active(commandDevice); + ScopedDevice active(commandDevice); UR_CHECK_ERROR(hipDeviceDisablePeerAccess(peerDevice->get())); } catch (ur_result_t err) { return err; @@ -42,7 +42,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( int value; hipDeviceP2PAttr hipAttr; try { - ScopedContext active(commandDevice); + ScopedDevice active(commandDevice); switch 
(propName) { case UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORTED: { hipAttr = hipDevP2PAttrAccessSupported; diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 028fb779cb..ef7abaf051 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -3,175 +3,212 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +if(UR_BUILD_ADAPTER_L0) + set(ADAPTER_LIB_TYPE SHARED) + if(UR_STATIC_ADAPTER_L0) + set(ADAPTER_LIB_TYPE STATIC) + endif() -set(TARGET_NAME ur_adapter_level_zero) -set(UR_LEVEL_ZERO_LOADER_LIBRARY "" CACHE FILEPATH "Path of the Level Zero Loader library") -set(UR_LEVEL_ZERO_INCLUDE_DIR "" CACHE FILEPATH "Directory containing the Level Zero Headers") -set(UR_LEVEL_ZERO_LOADER_REPO "" CACHE STRING "Github repo to get the Level Zero loader sources from") -set(UR_LEVEL_ZERO_LOADER_TAG "" CACHE STRING " GIT tag of the Level Loader taken from github repo") - -# Copy Level Zero loader/headers locally to the build to avoid leaking their path. 
-set(LEVEL_ZERO_COPY_DIR ${CMAKE_CURRENT_BINARY_DIR}/level_zero_loader) -if (NOT UR_LEVEL_ZERO_LOADER_LIBRARY STREQUAL "") - get_filename_component(LEVEL_ZERO_LIB_NAME "${UR_LEVEL_ZERO_LOADER_LIBRARY}" NAME) - set(LEVEL_ZERO_LIBRARY ${LEVEL_ZERO_COPY_DIR}/${LEVEL_ZERO_LIB_NAME}) - message(STATUS "Level Zero Adapter: Copying Level Zero loader to local build tree") - file(COPY ${UR_LEVEL_ZERO_LOADER_LIBRARY} DESTINATION ${LEVEL_ZERO_COPY_DIR} FOLLOW_SYMLINK_CHAIN) -endif() -if (NOT UR_LEVEL_ZERO_INCLUDE_DIR STREQUAL "") - set(LEVEL_ZERO_INCLUDE_DIR ${LEVEL_ZERO_COPY_DIR}) - message(STATUS "Level Zero Adapter: Copying Level Zero headers to local build tree") - file(COPY ${UR_LEVEL_ZERO_INCLUDE_DIR}/ DESTINATION ${LEVEL_ZERO_COPY_DIR}) -endif() - -if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR) - message(STATUS "Level Zero Adapter: Download Level Zero loader and headers from github.com") - - # Workaround warnings/errors for Level Zero build - set(CMAKE_CXX_FLAGS_BAK "${CMAKE_CXX_FLAGS}") - if (UNIX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-but-set-variable") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-pedantic") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-stringop-truncation") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-c++98-compat-extra-semi") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-warning-option") + add_ur_adapter(ur_adapter_level_zero ${ADAPTER_LIB_TYPE} + ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/adapter.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/command_buffer.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/command_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/common.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/context.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/device.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/event.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/usm.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/memory.hpp + 
${CMAKE_CURRENT_SOURCE_DIR}/kernel.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/physical_mem.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/platform.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/program.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/sampler.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/tensor_map.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/ur_level_zero.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/context.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_native.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/memory.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/physical_mem.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/program.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/image.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp + ) + install_ur_library(ur_adapter_level_zero) + + if(UR_STATIC_ADAPTER_L0) + target_compile_definitions(ur_adapter_level_zero PUBLIC UR_STATIC_ADAPTER_LEVEL_ZERO) + + # 'utils' target from 'level-zero-loader' includes path which is prefixed + # in the source directory, this breaks the installation of 'utils' target. 
+ set_target_properties(utils PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "") + install(TARGETS ur_umf LevelZeroLoader LevelZeroLoader-Headers ze_loader utils + EXPORT ${PROJECT_NAME}-targets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) endif() - if (UR_LEVEL_ZERO_LOADER_REPO STREQUAL "") - set(UR_LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git") + # Ensure UR flags are propagated to level zero + # Note: UR compile options cause issues under MSVC + if(NOT MSVC) + foreach(TARGET IN ITEMS ze_loader ze_validation_layer ze_tracing_layer ze_null) + if (TARGET TARGET) + add_ur_target_compile_options(${TARGET}) + add_ur_target_link_options(${TARGET}) + target_compile_options(${TARGET} PRIVATE + $<$:-Wno-error -Wno-unused-parameter> + $<$:/WX- /UUNICODE> + ) + endif() + endforeach() endif() - if (UR_LEVEL_ZERO_LOADER_TAG STREQUAL "") - set(UR_LEVEL_ZERO_LOADER_TAG v1.17.6) + + if(NOT WIN32) + target_sources(ur_adapter_level_zero + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/adapter_lib_init_linux.cpp + ) endif() - # Disable due to a bug https://github.com/oneapi-src/level-zero/issues/104 - set(CMAKE_INCLUDE_CURRENT_DIR OFF) - # Prevent L0 loader from exporting extra symbols - set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF) + # TODO: fix level_zero adapter conversion warnings + target_compile_options(ur_adapter_level_zero PRIVATE + $<$:/wd4805 /wd4244> + ) - message(STATUS "Level Zero Adapter: Will fetch Level Zero Loader from ${UR_LEVEL_ZERO_LOADER_REPO}") - include(FetchContent) - FetchContent_Declare(level-zero-loader - GIT_REPOSITORY ${UR_LEVEL_ZERO_LOADER_REPO} - GIT_TAG ${UR_LEVEL_ZERO_LOADER_TAG} + set_target_properties(ur_adapter_level_zero PROPERTIES + VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" + SOVERSION "${PROJECT_VERSION_MAJOR}" ) - if(MSVC) - set(USE_Z7 ON) - endif() - FetchContent_MakeAvailable(level-zero-loader) - 
FetchContent_GetProperties(level-zero-loader) - # Restore original flags - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_BAK}") + if (WIN32) + # 0x800: Search for the DLL only in the System32 folder + target_link_options(ur_adapter_level_zero PRIVATE /DEPENDENTLOADFLAG:0x800) + endif() - target_compile_options(ze_loader PRIVATE - $<$,GNU;Clang;Intel;IntelLLVM>:-Wno-error> - $<$:/WX- /UUNICODE> + target_link_libraries(ur_adapter_level_zero PRIVATE + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::common + ${PROJECT_NAME}::umf + LevelZeroLoader + LevelZeroLoader-Headers ) - set(LEVEL_ZERO_LIBRARY ze_loader) - set(LEVEL_ZERO_INCLUDE_DIR - ${level-zero-loader_SOURCE_DIR}/include CACHE PATH "Path to Level Zero Headers") + target_include_directories(ur_adapter_level_zero PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/../../" + LevelZeroLoader-Headers + ) endif() -add_library (LevelZeroLoader INTERFACE) -# The MSVC linker does not like / at the start of a path, so to work around this -# we split it into a link library and a library path, where the path is allowed -# to have leading /. 
-get_filename_component(LEVEL_ZERO_LIBRARY_SRC "${LEVEL_ZERO_LIBRARY}" DIRECTORY) -get_filename_component(LEVEL_ZERO_LIB_NAME "${LEVEL_ZERO_LIBRARY}" NAME) -target_link_directories(LevelZeroLoader - INTERFACE "${LEVEL_ZERO_LIBRARY_SRC}" -) -target_link_libraries(LevelZeroLoader - INTERFACE "${LEVEL_ZERO_LIB_NAME}" -) - -add_library (LevelZeroLoader-Headers INTERFACE) -target_include_directories(LevelZeroLoader-Headers - INTERFACE "${LEVEL_ZERO_INCLUDE_DIR}" -) - -add_ur_adapter(${TARGET_NAME} - SHARED - ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/adapter.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_buffer.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_buffer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/common.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/context.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/device.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/event.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/usm.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/memory.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/kernel.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/physical_mem.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/platform.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/program.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/queue_api.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/sampler.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_factory.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/v2/context.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/ur_level_zero.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/context.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_native.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/memory.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp - 
${CMAKE_CURRENT_SOURCE_DIR}/physical_mem.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/program.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/queue_api.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/tensor_map.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/image.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/v2/context.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.cpp -) - -if(NOT WIN32) - target_sources(ur_adapter_level_zero - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/adapter_lib_init_linux.cpp +if(UR_BUILD_ADAPTER_L0_V2) + add_ur_adapter(ur_adapter_level_zero_v2 + SHARED + # sources shared with legacy adapter + ${CMAKE_CURRENT_SOURCE_DIR}/adapter.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/common.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/device.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/platform.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/program.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tensor_map.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/program.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp + # v2-only sources + ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/context.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool_cache.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_provider_counter.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_provider_normal.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_provider.hpp + 
${CMAKE_CURRENT_SOURCE_DIR}/v2/event.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/kernel.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/memory.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/api.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/context.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool_cache.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_pool.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_provider_counter.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/event_provider_normal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/event.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/kernel.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/memory.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_create.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.cpp ) -endif() + install_ur_library(ur_adapter_level_zero_v2) -# TODO: fix level_zero adapter conversion warnings -target_compile_options(${TARGET_NAME} PRIVATE - $<$:/wd4805 /wd4244> -) + if(NOT WIN32) + # api.cpp contains NOT_SUPPORTED functions-only + set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/v2/api.cpp + PROPERTIES APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-unused-parameter") -set_target_properties(${TARGET_NAME} PROPERTIES - VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" - SOVERSION "${PROJECT_VERSION_MAJOR}" -) + target_sources(ur_adapter_level_zero_v2 + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/adapter_lib_init_linux.cpp + ) + endif() -if (WIN32) - # 0x800: Search for the DLL only in the System32 folder - target_link_options(ur_adapter_level_zero PUBLIC /DEPENDENTLOADFLAG:0x800) -endif() + target_compile_definitions(ur_adapter_level_zero_v2 PUBLIC UR_ADAPTER_LEVEL_ZERO_V2) + + # TODO: fix level_zero adapter conversion warnings + 
target_compile_options(ur_adapter_level_zero_v2 PRIVATE + $<$:/wd4805 /wd4244> + ) -target_link_libraries(${TARGET_NAME} PRIVATE - ${PROJECT_NAME}::headers - ${PROJECT_NAME}::common - LevelZeroLoader - LevelZeroLoader-Headers -) - -target_include_directories(${TARGET_NAME} PRIVATE - "${CMAKE_CURRENT_SOURCE_DIR}/../../" - LevelZeroLoader-Headers -) + set_target_properties(ur_adapter_level_zero_v2 PROPERTIES + VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" + SOVERSION "${PROJECT_VERSION_MAJOR}" + ) + + if (WIN32) + # 0x800: Search for the DLL only in the System32 folder + target_link_options(ur_adapter_level_zero_v2 PUBLIC /DEPENDENTLOADFLAG:0x800) + endif() + + target_link_libraries(ur_adapter_level_zero_v2 PRIVATE + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::common + ${PROJECT_NAME}::umf + LevelZeroLoader + LevelZeroLoader-Headers + ) + + target_include_directories(ur_adapter_level_zero_v2 PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/../.." + "${CMAKE_CURRENT_SOURCE_DIR}/../../ur" + LevelZeroLoader-Headers + ) +endif() diff --git a/source/adapters/level_zero/adapter.cpp b/source/adapters/level_zero/adapter.cpp index 0605b9a40c..be79c09fef 100644 --- a/source/adapters/level_zero/adapter.cpp +++ b/source/adapters/level_zero/adapter.cpp @@ -12,6 +12,14 @@ #include "ur_level_zero.hpp" #include +// As windows order of unloading dlls is reversed from linux, windows will call +// umfTearDown before it could release umf objects in level_zero, so we call +// umfInit on urAdapterGet and umfAdapterTearDown to enforce the teardown of umf +// after umf objects are destructed. +#if defined(_WIN32) +#include +#endif + // Due to multiple DLLMain definitions with SYCL, Global Adapter is init at // variable creation. 
#if defined(_WIN32) @@ -19,7 +27,12 @@ ur_adapter_handle_t_ *GlobalAdapter = new ur_adapter_handle_t_(); #else ur_adapter_handle_t_ *GlobalAdapter; #endif - +// This is a temporary workaround on windows, where UR adapter is teardowned +// before the UR loader, which will result in access violation when we use print +// function as the overrided print function was already released with the UR +// adapter. +// TODO: Change adapters to use a common sink class in the loader instead of +// using thier own sink class that inherit from logger::Sink. class ur_legacy_sink : public logger::Sink { public: ur_legacy_sink(std::string logger_name = "", bool skip_prefix = true) @@ -32,10 +45,39 @@ class ur_legacy_sink : public logger::Sink { fprintf(stderr, "%s", msg.c_str()); } - ~ur_legacy_sink() = default; + ~ur_legacy_sink() { +#if defined(_WIN32) + logger::isTearDowned = true; +#endif + }; }; -ur_result_t initPlatforms(PlatformVec &platforms) noexcept try { +// Find the corresponding ZesDevice Handle for a given ZeDevice +ur_result_t getZesDeviceHandle(zes_uuid_t coreDeviceUuid, + zes_device_handle_t *ZesDevice, + uint32_t *SubDeviceId, ze_bool_t *SubDevice) { + uint32_t ZesDriverCount = 0; + std::vector ZesDrivers; + std::vector ZesDevices; + ze_result_t ZesResult = ZE_RESULT_ERROR_INVALID_ARGUMENT; + ZE2UR_CALL(GlobalAdapter->getSysManDriversFunctionPtr, + (&ZesDriverCount, nullptr)); + ZesDrivers.resize(ZesDriverCount); + ZE2UR_CALL(GlobalAdapter->getSysManDriversFunctionPtr, + (&ZesDriverCount, ZesDrivers.data())); + for (uint32_t I = 0; I < ZesDriverCount; ++I) { + ZesResult = ZE_CALL_NOCHECK( + GlobalAdapter->getDeviceByUUIdFunctionPtr, + (ZesDrivers[I], coreDeviceUuid, ZesDevice, SubDevice, SubDeviceId)); + if (ZesResult == ZE_RESULT_SUCCESS) { + return UR_RESULT_SUCCESS; + } + } + return UR_RESULT_ERROR_INVALID_ARGUMENT; +} + +ur_result_t initPlatforms(PlatformVec &platforms, + ze_result_t ZesResult) noexcept try { uint32_t ZeDriverCount = 0; ZE2UR_CALL(zeDriverGet, 
(&ZeDriverCount, nullptr)); if (ZeDriverCount == 0) { @@ -43,22 +85,64 @@ ur_result_t initPlatforms(PlatformVec &platforms) noexcept try { } std::vector ZeDrivers; + std::vector ZeDevices; ZeDrivers.resize(ZeDriverCount); ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data())); for (uint32_t I = 0; I < ZeDriverCount; ++I) { + // Keep track of the first platform init for this Driver + bool DriverPlatformInit = false; + ze_device_properties_t device_properties{}; + device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + uint32_t ZeDeviceCount = 0; + ZE2UR_CALL(zeDeviceGet, (ZeDrivers[I], &ZeDeviceCount, nullptr)); + ZeDevices.resize(ZeDeviceCount); + ZE2UR_CALL(zeDeviceGet, (ZeDrivers[I], &ZeDeviceCount, ZeDevices.data())); auto platform = std::make_unique(ZeDrivers[I]); - UR_CALL(platform->initialize()); - - // Save a copy in the cache for future uses. - platforms.push_back(std::move(platform)); + // Check if this driver has GPU Devices + for (uint32_t D = 0; D < ZeDeviceCount; ++D) { + ZE2UR_CALL(zeDeviceGetProperties, (ZeDevices[D], &device_properties)); + if (ZE_DEVICE_TYPE_GPU == device_properties.type) { + // Check if this driver's platform has already been init. + if (!DriverPlatformInit) { + // If this Driver is a GPU, save it as a usable platform. + UR_CALL(platform->initialize()); + + // Save a copy in the cache for future uses. + platforms.push_back(std::move(platform)); + // Mark this driver's platform as init to prevent additional platforms + // from being created per driver. + DriverPlatformInit = true; + } + if (ZesResult == ZE_RESULT_SUCCESS) { + // Populate the Zes/Ze device mapping for this Ze Device into the last + // added platform which represents the current driver being queried. 
+ ur_zes_device_handle_data_t ZesDeviceData; + zes_uuid_t ZesUUID; + std::memcpy(&ZesUUID, &device_properties.uuid, sizeof(zes_uuid_t)); + if (getZesDeviceHandle( + ZesUUID, &ZesDeviceData.ZesDevice, &ZesDeviceData.SubDeviceId, + &ZesDeviceData.SubDevice) == UR_RESULT_SUCCESS) { + platforms.back()->ZedeviceToZesDeviceMap.insert( + std::make_pair(ZeDevices[D], std::move(ZesDeviceData))); + } + } + } + } } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); } -ur_result_t adapterStateInit() { return UR_RESULT_SUCCESS; } +ur_result_t adapterStateInit() { + +#if defined(_WIN32) + umfInit(); +#endif + + return UR_RESULT_SUCCESS; +} ur_adapter_handle_t_::ur_adapter_handle_t_() : logger(logger::get_logger("level_zero")) { @@ -105,8 +189,16 @@ ur_adapter_handle_t_::ur_adapter_handle_t_() // We must only initialize the driver once, even if urPlatformGet() is // called multiple times. Declaring the return value as "static" ensures // it's only called once. - GlobalAdapter->ZeResult = - ZE_CALL_NOCHECK(zeInit, (ZE_INIT_FLAG_GPU_ONLY)); + + // Init with all flags set to enable for all driver types to be init in + // the application. + ze_init_flags_t L0InitFlags = ZE_INIT_FLAG_GPU_ONLY; + if (UrL0InitAllDrivers) { + L0InitFlags |= ZE_INIT_FLAG_VPU_ONLY; + } + logger::debug("\nzeInit with flags value of {}\n", + static_cast(L0InitFlags)); + GlobalAdapter->ZeResult = ZE_CALL_NOCHECK(zeInit, (L0InitFlags)); } assert(GlobalAdapter->ZeResult != std::nullopt); // verify that level-zero is initialized @@ -123,8 +215,36 @@ ur_adapter_handle_t_::ur_adapter_handle_t_() return; } + // Dynamically load the new L0 SysMan separate init and new EXP apis + // separately. This must be done to avoid attempting to use symbols that do + // not exist in older loader runtimes. 
+#ifdef _WIN32 + HMODULE processHandle = GetModuleHandle(NULL); +#else + HMODULE processHandle = nullptr; +#endif + GlobalAdapter->getDeviceByUUIdFunctionPtr = + (zes_pfnDriverGetDeviceByUuidExp_t)ur_loader::LibLoader::getFunctionPtr( + processHandle, "zesDriverGetDeviceByUuidExp"); + GlobalAdapter->getSysManDriversFunctionPtr = + (zes_pfnDriverGet_t)ur_loader::LibLoader::getFunctionPtr( + processHandle, "zesDriverGet"); + GlobalAdapter->sysManInitFunctionPtr = + (zes_pfnInit_t)ur_loader::LibLoader::getFunctionPtr(processHandle, + "zesInit"); + if (GlobalAdapter->getDeviceByUUIdFunctionPtr && + GlobalAdapter->getSysManDriversFunctionPtr && + GlobalAdapter->sysManInitFunctionPtr) { + ze_init_flags_t L0ZesInitFlags = 0; + logger::debug("\nzesInit with flags value of {}\n", + static_cast(L0ZesInitFlags)); + GlobalAdapter->ZesResult = ZE_CALL_NOCHECK( + GlobalAdapter->sysManInitFunctionPtr, (L0ZesInitFlags)); + } else { + GlobalAdapter->ZesResult = ZE_RESULT_ERROR_UNINITIALIZED; + } - ur_result_t err = initPlatforms(platforms); + ur_result_t err = initPlatforms(platforms, *GlobalAdapter->ZesResult); if (err == UR_RESULT_SUCCESS) { result = std::move(platforms); } else { @@ -140,12 +260,11 @@ void globalAdapterOnDemandCleanup() { } ur_result_t adapterStateTeardown() { - bool LeakFound = false; - // Print the balance of various create/destroy native calls. // The idea is to verify if the number of create(+) and destroy(-) calls are // matched. 
if (ZeCallCount && (UrL0LeaksDebug) != 0) { + bool LeakFound = false; // clang-format off // // The format of this table is such that each row accounts for a @@ -228,19 +347,22 @@ ur_result_t adapterStateTeardown() { ZeCallCount->clear(); delete ZeCallCount; ZeCallCount = nullptr; + if (LeakFound) + return UR_RESULT_ERROR_INVALID_MEM_OBJECT; } - if (LeakFound) - return UR_RESULT_ERROR_INVALID_MEM_OBJECT; - // Due to multiple DLLMain definitions with SYCL, register to cleanup the - // Global Adapter after refcnt is 0 + + // Due to multiple DLLMain definitions with SYCL, register to cleanup the + // Global Adapter after refcnt is 0 #if defined(_WIN32) + umfTearDown(); std::atexit(globalAdapterOnDemandCleanup); #endif return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urAdapterGet( +namespace ur::level_zero { +ur_result_t urAdapterGet( uint32_t NumEntries, ///< [in] the number of platforms to be added to ///< phAdapters. If phAdapters is not NULL, then ///< NumEntries should be greater than zero, otherwise @@ -281,7 +403,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGet( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urAdapterRelease(ur_adapter_handle_t) { +ur_result_t urAdapterRelease(ur_adapter_handle_t) { // Check first if the Adapter pointer is valid if (GlobalAdapter) { std::lock_guard Lock{GlobalAdapter->Mutex}; @@ -293,7 +415,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterRelease(ur_adapter_handle_t) { return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urAdapterRetain(ur_adapter_handle_t) { +ur_result_t urAdapterRetain(ur_adapter_handle_t) { if (GlobalAdapter) { std::lock_guard Lock{GlobalAdapter->Mutex}; GlobalAdapter->RefCount++; @@ -302,7 +424,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterRetain(ur_adapter_handle_t) { return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetLastError( +ur_result_t urAdapterGetLastError( ur_adapter_handle_t, ///< [in] handle of the platform instance 
const char **Message, ///< [out] pointer to a C string where the adapter ///< specific error message will be stored. @@ -315,11 +437,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetLastError( return ErrorMessageCode; } -UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t, - ur_adapter_info_t PropName, - size_t PropSize, - void *PropValue, - size_t *PropSizeRet) { +ur_result_t urAdapterGetInfo(ur_adapter_handle_t, ur_adapter_info_t PropName, + size_t PropSize, void *PropValue, + size_t *PropSizeRet) { UrReturnHelper ReturnValue(PropSize, PropValue, PropSizeRet); switch (PropName) { @@ -333,3 +453,4 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t, return UR_RESULT_SUCCESS; } +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/adapter.hpp b/source/adapters/level_zero/adapter.hpp index 273cdb4193..53a58793e5 100644 --- a/source/adapters/level_zero/adapter.hpp +++ b/source/adapters/level_zero/adapter.hpp @@ -11,11 +11,13 @@ #include "logger/ur_logger.hpp" #include +#include #include #include #include #include #include +#include using PlatformVec = std::vector>; @@ -26,7 +28,12 @@ struct ur_adapter_handle_t_ { std::atomic RefCount = 0; std::mutex Mutex; + zes_pfnDriverGetDeviceByUuidExp_t getDeviceByUUIdFunctionPtr = nullptr; + zes_pfnDriverGet_t getSysManDriversFunctionPtr = nullptr; + zes_pfnInit_t sysManInitFunctionPtr = nullptr; + std::optional ZeResult; + std::optional ZesResult; ZeCache> PlatformCache; logger::Logger &logger; }; diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index d9dba22970..2895660d0b 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -8,7 +8,9 @@ // //===----------------------------------------------------------------------===// #include "command_buffer.hpp" +#include "helpers/kernel_helpers.hpp" #include "logger/ur_logger.hpp" +#include 
"ur_interface_loader.hpp" #include "ur_level_zero.hpp" /* L0 Command-buffer Extension Doc see: @@ -78,130 +80,6 @@ preferCopyEngineForFill(ur_exp_command_buffer_handle_t CommandBuffer, return UR_RESULT_SUCCESS; } -/** - * Calculates a work group size for the kernel based on the GlobalWorkSize or - * the LocalWorkSize if provided. - * @param[in][optional] Kernel The Kernel. Used when LocalWorkSize is not - * provided. - * @param[in][optional] Device The device associated with the kernel. Used when - * LocalWorkSize is not provided. - * @param[out] ZeThreadGroupDimensions Number of work groups in each dimension. - * @param[out] WG The work group size for each dimension. - * @param[in] WorkDim The number of dimensions in the kernel. - * @param[in] GlobalWorkSize The global work size. - * @param[in][optional] LocalWorkSize The local work size. - * @return UR_RESULT_SUCCESS or an error code on failure. - */ -ur_result_t calculateKernelWorkDimensions( - ur_kernel_handle_t Kernel, ur_device_handle_t Device, - ze_group_count_t &ZeThreadGroupDimensions, uint32_t (&WG)[3], - uint32_t WorkDim, const size_t *GlobalWorkSize, - const size_t *LocalWorkSize) { - - UR_ASSERT(GlobalWorkSize, UR_RESULT_ERROR_INVALID_VALUE); - // If LocalWorkSize is not provided then Kernel must be provided to query - // suggested group size. - UR_ASSERT(LocalWorkSize || Kernel, UR_RESULT_ERROR_INVALID_VALUE); - - // New variable needed because GlobalWorkSize parameter might not be of size - // 3 - size_t GlobalWorkSize3D[3]{1, 1, 1}; - std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D); - - if (LocalWorkSize) { - WG[0] = ur_cast(LocalWorkSize[0]); - WG[1] = WorkDim >= 2 ? ur_cast(LocalWorkSize[1]) : 1; - WG[2] = WorkDim == 3 ? ur_cast(LocalWorkSize[2]) : 1; - } else { - // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize3D - // values do not fit to 32-bit that the API only supports currently. 
- bool SuggestGroupSize = true; - for (int I : {0, 1, 2}) { - if (GlobalWorkSize3D[I] > UINT32_MAX) { - SuggestGroupSize = false; - } - } - if (SuggestGroupSize) { - ZE2UR_CALL(zeKernelSuggestGroupSize, - (Kernel->ZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1], - GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2])); - } else { - for (int I : {0, 1, 2}) { - // Try to find a I-dimension WG size that the GlobalWorkSize3D[I] is - // fully divisable with. Start with the max possible size in - // each dimension. - uint32_t GroupSize[] = { - Device->ZeDeviceComputeProperties->maxGroupSizeX, - Device->ZeDeviceComputeProperties->maxGroupSizeY, - Device->ZeDeviceComputeProperties->maxGroupSizeZ}; - GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]); - while (GlobalWorkSize3D[I] % GroupSize[I]) { - --GroupSize[I]; - } - if (GlobalWorkSize[I] / GroupSize[I] > UINT32_MAX) { - logger::debug("calculateKernelWorkDimensions: can't find a WG size " - "suitable for global work size > UINT32_MAX"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - WG[I] = GroupSize[I]; - } - logger::debug("calculateKernelWorkDimensions: using computed WG " - "size = {{{}, {}, {}}}", - WG[0], WG[1], WG[2]); - } - } - - // TODO: assert if sizes do not fit into 32-bit? 
- switch (WorkDim) { - case 3: - ZeThreadGroupDimensions.groupCountX = - ur_cast(GlobalWorkSize3D[0] / WG[0]); - ZeThreadGroupDimensions.groupCountY = - ur_cast(GlobalWorkSize3D[1] / WG[1]); - ZeThreadGroupDimensions.groupCountZ = - ur_cast(GlobalWorkSize3D[2] / WG[2]); - break; - case 2: - ZeThreadGroupDimensions.groupCountX = - ur_cast(GlobalWorkSize3D[0] / WG[0]); - ZeThreadGroupDimensions.groupCountY = - ur_cast(GlobalWorkSize3D[1] / WG[1]); - WG[2] = 1; - break; - case 1: - ZeThreadGroupDimensions.groupCountX = - ur_cast(GlobalWorkSize3D[0] / WG[0]); - WG[1] = WG[2] = 1; - break; - - default: - logger::error("calculateKernelWorkDimensions: unsupported work_dim"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - - // Error handling for non-uniform group size case - if (GlobalWorkSize3D[0] != - size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) { - logger::error("calculateKernelWorkDimensions: invalid work_dim. The range " - "is not a multiple of the group size in the 1st dimension"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - if (GlobalWorkSize3D[1] != - size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) { - logger::error("calculateKernelWorkDimensions: invalid work_dim. The range " - "is not a multiple of the group size in the 2nd dimension"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - if (GlobalWorkSize3D[2] != - size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) { - logger::error("calculateKernelWorkDimensions: invalid work_dim. The range " - "is not a multiple of the group size in the 3rd dimension"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - - return UR_RESULT_SUCCESS; -} - /** * Helper function for finding the Level Zero events associated with the * commands in a command-buffer, each event is pointed to by a sync-point in the @@ -420,18 +298,16 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( IsUpdatable(Desc ? Desc->isUpdatable : false), IsProfilingEnabled(Desc ? 
Desc->enableProfiling : false), IsInOrderCmdList(IsInOrderCmdList) { - urContextRetain(Context); - urDeviceRetain(Device); + ur::level_zero::urContextRetain(Context); + ur::level_zero::urDeviceRetain(Device); } -// The ur_exp_command_buffer_handle_t_ destructor releases all the memory -// objects allocated for command_buffer management. -ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { +void ur_exp_command_buffer_handle_t_::cleanupCommandBufferResources() { // Release the memory allocated to the Context stored in the command_buffer - urContextRelease(Context); + ur::level_zero::urContextRelease(Context); // Release the device - urDeviceRelease(Device); + ur::level_zero::urDeviceRelease(Device); // Release the memory allocated to the CommandList stored in the // command_buffer @@ -501,7 +377,7 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { for (auto &AssociatedKernel : KernelsList) { ReleaseIndirectMem(AssociatedKernel); - urKernelRelease(AssociatedKernel); + ur::level_zero::urKernelRelease(AssociatedKernel); } } @@ -512,16 +388,16 @@ ur_exp_command_buffer_command_handle_t_:: ur_kernel_handle_t Kernel = nullptr) : CommandBuffer(CommandBuffer), CommandId(CommandId), WorkDim(WorkDim), UserDefinedLocalSize(UserDefinedLocalSize), Kernel(Kernel) { - urCommandBufferRetainExp(CommandBuffer); + ur::level_zero::urCommandBufferRetainExp(CommandBuffer); if (Kernel) - urKernelRetain(Kernel); + ur::level_zero::urKernelRetain(Kernel); } ur_exp_command_buffer_command_handle_t_:: ~ur_exp_command_buffer_command_handle_t_() { - urCommandBufferReleaseExp(CommandBuffer); + ur::level_zero::urCommandBufferReleaseExp(CommandBuffer); if (Kernel) - urKernelRelease(Kernel); + ur::level_zero::urKernelRelease(Kernel); } void ur_exp_command_buffer_handle_t_::registerSyncPoint( @@ -558,7 +434,7 @@ ur_result_t ur_exp_command_buffer_handle_t_::getFenceForQueue( return UR_RESULT_SUCCESS; } -namespace { +namespace ur::level_zero { /** * Creates a L0 
command list @@ -612,15 +488,14 @@ ur_result_t createMainCommandList(ur_context_handle_t Context, bool canBeInOrder(ur_context_handle_t Context, const ur_exp_command_buffer_desc_t *CommandBufferDesc) { // In-order command-lists are not available in old driver version. - bool CompatibleDriver = isDriverVersionNewerOrSimilar( - Context->getPlatform()->ZeDriver, 1, 3, L0_DRIVER_INORDER_MIN_VERSION); + bool CompatibleDriver = Context->getPlatform()->isDriverVersionNewerOrSimilar( + 1, 3, L0_DRIVER_INORDER_MIN_VERSION); return CompatibleDriver ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false) : false; } -} // namespace -UR_APIEXPORT ur_result_t UR_APICALL +ur_result_t urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, const ur_exp_command_buffer_desc_t *CommandBufferDesc, ur_exp_command_buffer_handle_t *CommandBuffer) { @@ -692,22 +567,23 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL +ur_result_t urCommandBufferRetainExp(ur_exp_command_buffer_handle_t CommandBuffer) { CommandBuffer->RefCount.increment(); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL +ur_result_t urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t CommandBuffer) { if (!CommandBuffer->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; + CommandBuffer->cleanupCommandBufferResources(); delete CommandBuffer; return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL +ur_result_t urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { UR_ASSERT(CommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); // It is not allowed to append to command list from multiple threads. @@ -751,8 +627,6 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { return UR_RESULT_SUCCESS; } -namespace { - /** * Sets the global offset for a kernel command that will be appended to the * command buffer. 
@@ -798,7 +672,7 @@ setKernelPendingArguments(ur_exp_command_buffer_handle_t CommandBuffer, char **ZeHandlePtr = nullptr; if (Arg.Value) { UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode, - CommandBuffer->Device)); + CommandBuffer->Device, nullptr, 0u)); } ZE2UR_CALL(zeKernelSetArgumentValue, (Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr)); @@ -854,17 +728,26 @@ createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer, return UR_RESULT_SUCCESS; } -} // namespace -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( +ur_result_t urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, uint32_t WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + uint32_t /*numKernelAlternatives*/, + ur_kernel_handle_t * /*phKernelAlternatives*/, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *RetSyncPoint, + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_exp_command_buffer_sync_point_t *RetSyncPoint, ur_event_handle_t *Event, ur_exp_command_buffer_command_handle_t *Command) { + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + UR_ASSERT(Kernel->Program, UR_RESULT_ERROR_INVALID_NULL_POINTER); + // Command handles can only be obtained from updatable command-buffers + UR_ASSERT(!(Command && !CommandBuffer->IsUpdatable), + UR_RESULT_ERROR_INVALID_OPERATION); // Lock automatically releases when this goes out of scope. 
std::scoped_lock Lock( @@ -881,7 +764,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; uint32_t WG[3]; - UR_CALL(calculateKernelWorkDimensions(Kernel, CommandBuffer->Device, + UR_CALL(calculateKernelWorkDimensions(Kernel->ZeKernel, CommandBuffer->Device, ZeThreadGroupDimensions, WG, WorkDim, GlobalWorkSize, LocalWorkSize)); @@ -893,9 +776,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( // is in use. Once the event has been signaled, the code in // CleanupCompletedEvent(Event) will do a urKernelRelease to update the // reference count on the kernel, using the kernel saved in CommandData. - UR_CALL(urKernelRetain(Kernel)); + UR_CALL(ur::level_zero::urKernelRetain(Kernel)); - if (Command && CommandBuffer->IsUpdatable) { + if (Command) { UR_CALL(createCommandHandle(CommandBuffer, Kernel, WorkDim, LocalWorkSize, *Command)); } @@ -914,14 +797,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( +ur_result_t urCommandBufferAppendUSMMemcpyExp( ur_exp_command_buffer_handle_t CommandBuffer, void *Dst, const void *Src, size_t Size, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint, ur_event_handle_t *Event, + ur_exp_command_buffer_command_handle_t *Command) { + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + std::ignore = Command; bool PreferCopyEngine = !IsDevicePointer(CommandBuffer->Context, Src) || !IsDevicePointer(CommandBuffer->Context, Dst); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. 
+ if (CommandBuffer->Device->isDG2() && + (IsSharedPointer(CommandBuffer->Context, Src) || + IsSharedPointer(CommandBuffer->Context, Dst))) { + PreferCopyEngine = false; + } PreferCopyEngine |= UseCopyEngineForD2DCopy; return enqueueCommandBufferMemCopyHelper( @@ -929,12 +825,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( +ur_result_t urCommandBufferAppendMemBufferCopyExp( ur_exp_command_buffer_handle_t CommandBuffer, ur_mem_handle_t SrcMem, ur_mem_handle_t DstMem, size_t SrcOffset, size_t DstOffset, size_t Size, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint, ur_event_handle_t *Event, + ur_exp_command_buffer_command_handle_t *Command) { + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + std::ignore = Command; auto SrcBuffer = ur_cast<_ur_buffer *>(SrcMem); auto DstBuffer = ur_cast<_ur_buffer *>(DstMem); @@ -944,10 +846,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( char *ZeHandleSrc; UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, - CommandBuffer->Device)); + CommandBuffer->Device, nullptr, 0u)); char *ZeHandleDst; UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - CommandBuffer->Device)); + CommandBuffer->Device, nullptr, 0u)); bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost); @@ -959,14 +861,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( SyncPointWaitList, SyncPoint); } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( +ur_result_t urCommandBufferAppendMemBufferCopyRectExp( 
ur_exp_command_buffer_handle_t CommandBuffer, ur_mem_handle_t SrcMem, ur_mem_handle_t DstMem, ur_rect_offset_t SrcOrigin, ur_rect_offset_t DstOrigin, ur_rect_region_t Region, size_t SrcRowPitch, size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint, ur_event_handle_t *Event, + ur_exp_command_buffer_command_handle_t *Command) { + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + std::ignore = Command; auto SrcBuffer = ur_cast<_ur_buffer *>(SrcMem); auto DstBuffer = ur_cast<_ur_buffer *>(DstMem); @@ -976,10 +884,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( char *ZeHandleSrc; UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, - CommandBuffer->Device)); + CommandBuffer->Device, nullptr, 0u)); char *ZeHandleDst; UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - CommandBuffer->Device)); + CommandBuffer->Device, nullptr, 0u)); bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost); @@ -992,17 +900,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( SyncPointWaitList, SyncPoint); } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( +ur_result_t urCommandBufferAppendMemBufferWriteExp( ur_exp_command_buffer_handle_t CommandBuffer, ur_mem_handle_t Buffer, size_t Offset, size_t Size, const void *Src, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint, ur_event_handle_t *Event, + 
ur_exp_command_buffer_command_handle_t *Command) { + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + std::ignore = Command; std::scoped_lock Lock(Buffer->Mutex); char *ZeHandleDst = nullptr; UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - CommandBuffer->Device)); + CommandBuffer->Device, nullptr, 0u)); // Always prefer copy engine for writes bool PreferCopyEngine = true; @@ -1014,19 +928,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( SyncPoint); } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( +ur_result_t urCommandBufferAppendMemBufferWriteRectExp( ur_exp_command_buffer_handle_t CommandBuffer, ur_mem_handle_t Buffer, ur_rect_offset_t BufferOffset, ur_rect_offset_t HostOffset, ur_rect_region_t Region, size_t BufferRowPitch, size_t BufferSlicePitch, size_t HostRowPitch, size_t HostSlicePitch, void *Src, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint, ur_event_handle_t *Event, + ur_exp_command_buffer_command_handle_t *Command) { + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + std::ignore = Command; std::scoped_lock Lock(Buffer->Mutex); char *ZeHandleDst = nullptr; UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - CommandBuffer->Device)); + CommandBuffer->Device, nullptr, 0u)); // Always prefer copy engine for writes bool PreferCopyEngine = true; @@ -1039,16 +959,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( SyncPointWaitList, SyncPoint); } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( +ur_result_t urCommandBufferAppendMemBufferReadExp( ur_exp_command_buffer_handle_t CommandBuffer, 
ur_mem_handle_t Buffer, size_t Offset, size_t Size, void *Dst, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint, ur_event_handle_t *Event, + ur_exp_command_buffer_command_handle_t *Command) { + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + std::ignore = Command; std::scoped_lock SrcLock(Buffer->Mutex); char *ZeHandleSrc = nullptr; UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, - CommandBuffer->Device)); + CommandBuffer->Device, nullptr, 0u)); // Always prefer copy engine for reads bool PreferCopyEngine = true; @@ -1059,19 +985,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( SyncPoint); } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( +ur_result_t urCommandBufferAppendMemBufferReadRectExp( ur_exp_command_buffer_handle_t CommandBuffer, ur_mem_handle_t Buffer, ur_rect_offset_t BufferOffset, ur_rect_offset_t HostOffset, ur_rect_region_t Region, size_t BufferRowPitch, size_t BufferSlicePitch, size_t HostRowPitch, size_t HostSlicePitch, void *Dst, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint, ur_event_handle_t *Event, + ur_exp_command_buffer_command_handle_t *Command) { + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + std::ignore = Command; std::scoped_lock SrcLock(Buffer->Mutex); char *ZeHandleSrc; UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, - CommandBuffer->Device)); + CommandBuffer->Device, nullptr, 0u)); // Always prefer copy engine for 
reads bool PreferCopyEngine = true; @@ -1083,11 +1015,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( +ur_result_t urCommandBufferAppendUSMPrefetchExp( ur_exp_command_buffer_handle_t CommandBuffer, const void *Mem, size_t Size, ur_usm_migration_flags_t Flags, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *RetSyncPoint) { + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_exp_command_buffer_sync_point_t *RetSyncPoint, ur_event_handle_t *Event, + ur_exp_command_buffer_command_handle_t *Command) { + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + std::ignore = Command; std::ignore = Flags; if (CommandBuffer->IsInOrderCmdList) { @@ -1122,11 +1060,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( +ur_result_t urCommandBufferAppendUSMAdviseExp( ur_exp_command_buffer_handle_t CommandBuffer, const void *Mem, size_t Size, ur_usm_advice_flags_t Advice, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *RetSyncPoint) { + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_exp_command_buffer_sync_point_t *RetSyncPoint, ur_event_handle_t *Event, + ur_exp_command_buffer_command_handle_t *Command) { + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + std::ignore = Command; // A memory chunk can be advised with muliple memory advices // We therefore prefer if statements to switch cases to combine all potential // flags @@ -1184,19 +1128,25 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urCommandBufferAppendUSMAdviseExp( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( +ur_result_t urCommandBufferAppendMemBufferFillExp( ur_exp_command_buffer_handle_t CommandBuffer, ur_mem_handle_t Buffer, const void *Pattern, size_t PatternSize, size_t Offset, size_t Size, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint, ur_event_handle_t *Event, + ur_exp_command_buffer_command_handle_t *Command) { + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + std::ignore = Command; std::scoped_lock Lock(Buffer->Mutex); char *ZeHandleDst = nullptr; _ur_buffer *UrBuffer = reinterpret_cast<_ur_buffer *>(Buffer); UR_CALL(UrBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - CommandBuffer->Device)); + CommandBuffer->Device, nullptr, 0u)); return enqueueCommandBufferFillHelper( UR_COMMAND_MEM_BUFFER_FILL, CommandBuffer, ZeHandleDst + Offset, @@ -1205,12 +1155,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( +ur_result_t urCommandBufferAppendUSMFillExp( ur_exp_command_buffer_handle_t CommandBuffer, void *Ptr, const void *Pattern, size_t PatternSize, size_t Size, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_exp_command_buffer_sync_point_t *SyncPoint, ur_event_handle_t *Event, + ur_exp_command_buffer_command_handle_t *Command) { + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + 
std::ignore = Command; return enqueueCommandBufferFillHelper( UR_COMMAND_MEM_BUFFER_FILL, CommandBuffer, Ptr, @@ -1219,8 +1175,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } -namespace { - /** * Gets an L0 command queue that supports the chosen engine. * @param[in] Queue The UR queue used to submit the command buffer. @@ -1229,8 +1183,7 @@ namespace { * @param[out] ZeCommandQueue The L0 command queue. * @return UR_RESULT_SUCCESS or an error code on failure */ -ur_result_t getZeCommandQueue(ur_queue_handle_legacy_t Queue, - bool UseCopyEngine, +ur_result_t getZeCommandQueue(ur_queue_handle_t Queue, bool UseCopyEngine, ze_command_queue_handle_t &ZeCommandQueue) { auto &QGroup = Queue->getQueueGroup(UseCopyEngine); uint32_t QueueGroupOrdinal; @@ -1247,7 +1200,7 @@ ur_result_t getZeCommandQueue(ur_queue_handle_legacy_t Queue, * @return UR_RESULT_SUCCESS or an error code on failure */ ur_result_t waitForDependencies(ur_exp_command_buffer_handle_t CommandBuffer, - ur_queue_handle_legacy_t Queue, + ur_queue_handle_t Queue, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList) { const bool UseCopyEngine = false; @@ -1294,13 +1247,14 @@ ur_result_t waitForDependencies(ur_exp_command_buffer_handle_t CommandBuffer, * @param[in] CommandBuffer The command buffer. * @param[in] Queue The UR queue used to submit the command buffer. * @param[in] SignalCommandList The command-list to append the barrier to. - * @param[out] Event The host visible event which will be returned to the user. + * @param[out][optional] Event The host visible event which will be returned + * to the user, if user passed an output parameter to the UR API. 
* @return UR_RESULT_SUCCESS or an error code on failure */ ur_result_t createUserEvent(ur_exp_command_buffer_handle_t CommandBuffer, - ur_queue_handle_legacy_t Queue, + ur_queue_handle_t Queue, ur_command_list_ptr_t SignalCommandList, - ur_event_handle_t &Event) { + ur_event_handle_t *Event) { // Execution event for this enqueue of the UR command-buffer ur_event_handle_t RetEvent{}; @@ -1336,17 +1290,18 @@ ur_result_t createUserEvent(ur_exp_command_buffer_handle_t CommandBuffer, &(CommandBuffer->SignalEvent->ZeEvent))); } - Event = RetEvent; + if (Event) { + *Event = RetEvent; + } return UR_RESULT_SUCCESS; } -} // namespace -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( - ur_exp_command_buffer_handle_t CommandBuffer, ur_queue_handle_t UrQueue, - uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, - ur_event_handle_t *Event) { - auto Queue = Legacy(UrQueue); +ur_result_t +urCommandBufferEnqueueExp(ur_exp_command_buffer_handle_t CommandBuffer, + ur_queue_handle_t Queue, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ur_event_handle_t *Event) { std::scoped_lock Lock(Queue->Mutex); ze_command_queue_handle_t ZeCommandQueue; @@ -1399,22 +1354,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ZE2UR_CALL(zeCommandListAppendEventReset, (SignalCommandList->first, CommandBuffer->AllResetEvent->ZeEvent)); - if (Event) { - UR_CALL(createUserEvent(CommandBuffer, Queue, SignalCommandList, *Event)); - } + // Appends a wait on the main command-list signal and registers output Event + // parameter with signal command-list completing. 
+ UR_CALL(createUserEvent(CommandBuffer, Queue, SignalCommandList, Event)); UR_CALL(Queue->executeCommandList(SignalCommandList, false, false)); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainCommandExp( +ur_result_t urCommandBufferRetainCommandExp( ur_exp_command_buffer_command_handle_t Command) { Command->RefCount.increment(); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( +ur_result_t urCommandBufferReleaseCommandExp( ur_exp_command_buffer_command_handle_t Command) { if (!Command->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; @@ -1423,8 +1378,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( return UR_RESULT_SUCCESS; } -namespace { - /** * Validates contents of the update command description. * @param[in] Command The command which is being updated. @@ -1441,27 +1394,15 @@ ur_result_t validateCommandDesc( ->mutableCommandFlags; logger::debug("Mutable features supported by device {}", SupportedFeatures); - uint32_t Dim = CommandDesc->newWorkDim; - if (Dim != 0) { - // Error if work dim changes - if (Dim != Command->WorkDim) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } - - // Error If Local size and not global size - if ((CommandDesc->pNewLocalWorkSize != nullptr) && - (CommandDesc->pNewGlobalWorkSize == nullptr)) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } - - // Error if local size non-nullptr and created with null - // or if local size nullptr and created with non-null - const bool IsNewLocalSizeNull = CommandDesc->pNewLocalWorkSize == nullptr; - const bool IsOriginalLocalSizeNull = !Command->UserDefinedLocalSize; + // Kernel handle updates are not yet supported. 
+ if (CommandDesc->hNewKernel != Command->Kernel) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } - if (IsNewLocalSizeNull ^ IsOriginalLocalSizeNull) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } + if (CommandDesc->newWorkDim != Command->WorkDim && + (!CommandDesc->pNewGlobalWorkOffset || + !CommandDesc->pNewGlobalWorkSize)) { + return UR_RESULT_ERROR_INVALID_VALUE; } // Check if new global offset is provided. @@ -1469,7 +1410,7 @@ ur_result_t validateCommandDesc( UR_ASSERT(!NewGlobalWorkOffset || (SupportedFeatures & ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET), UR_RESULT_ERROR_UNSUPPORTED_FEATURE); - if (NewGlobalWorkOffset && Dim > 0) { + if (NewGlobalWorkOffset) { if (!CommandBuffer->Context->getPlatform() ->ZeDriverGlobalOffsetExtensionFound) { logger::error("No global offset extension found on this driver"); @@ -1578,8 +1519,8 @@ ur_result_t updateKernelCommand( uint32_t WG[3]; UR_CALL(calculateKernelWorkDimensions( - Command->Kernel, CommandBuffer->Device, ZeThreadGroupDimensions, WG, - Dim, NewGlobalWorkSize, NewLocalWorkSize)); + Command->Kernel->ZeKernel, CommandBuffer->Device, + ZeThreadGroupDimensions, WG, Dim, NewGlobalWorkSize, NewLocalWorkSize)); auto MutableGroupCountDesc = std::make_unique>(); @@ -1645,7 +1586,7 @@ ur_result_t updateKernelCommand( char **ZeHandlePtr = nullptr; if (NewMemObjArg) { UR_CALL(NewMemObjArg->getZeHandlePtr(ZeHandlePtr, UrAccessMode, - CommandBuffer->Device)); + CommandBuffer->Device, nullptr, 0u)); } auto ZeMutableArgDesc = @@ -1734,21 +1675,18 @@ ur_result_t updateKernelCommand( return UR_RESULT_SUCCESS; } -} // namespace -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( +ur_result_t urCommandBufferUpdateKernelLaunchExp( ur_exp_command_buffer_command_handle_t Command, const ur_exp_command_buffer_update_kernel_launch_desc_t *CommandDesc) { + UR_ASSERT(Command->CommandBuffer->IsUpdatable, + UR_RESULT_ERROR_INVALID_OPERATION); UR_ASSERT(Command->Kernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - 
UR_ASSERT(CommandDesc->newWorkDim <= 3, - UR_RESULT_ERROR_INVALID_WORK_DIMENSION); // Lock command, kernel and command buffer for update. std::scoped_lock Guard( Command->Mutex, Command->CommandBuffer->Mutex, Command->Kernel->Mutex); - UR_ASSERT(Command->CommandBuffer->IsUpdatable, - UR_RESULT_ERROR_INVALID_OPERATION); UR_ASSERT(Command->CommandBuffer->IsFinalized, UR_RESULT_ERROR_INVALID_OPERATION); @@ -1767,15 +1705,42 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp( - ur_exp_command_buffer_handle_t hCommandBuffer, - ur_exp_command_buffer_info_t propName, size_t propSize, void *pPropValue, - size_t *pPropSizeRet) { +ur_result_t urCommandBufferUpdateSignalEventExp( + ur_exp_command_buffer_command_handle_t Command, ur_event_handle_t *Event) { + std::ignore = Command; + std::ignore = Event; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferUpdateWaitEventsExp( + ur_exp_command_buffer_command_handle_t Command, + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList) { + std::ignore = Command; + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t +urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, + ur_exp_command_buffer_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { case UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT: return ReturnValue(uint32_t{hCommandBuffer->RefCount.load()}); + case UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR: { + ur_exp_command_buffer_desc_t Descriptor{}; + Descriptor.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; + Descriptor.pNext = nullptr; + Descriptor.isUpdatable = hCommandBuffer->IsUpdatable; + Descriptor.isInOrder = hCommandBuffer->IsInOrderCmdList; + 
Descriptor.enableProfiling = hCommandBuffer->IsProfilingEnabled; + + return ReturnValue(Descriptor); + } default: assert(!"Command-buffer info request not implemented"); } @@ -1783,10 +1748,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp( return UR_RESULT_ERROR_INVALID_ENUMERATION; } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCommandGetInfoExp( - ur_exp_command_buffer_command_handle_t Command, - ur_exp_command_buffer_command_info_t PropName, size_t PropSize, - void *PropValue, size_t *PropSizeRet) { +ur_result_t +urCommandBufferCommandGetInfoExp(ur_exp_command_buffer_command_handle_t Command, + ur_exp_command_buffer_command_info_t PropName, + size_t PropSize, void *PropValue, + size_t *PropSizeRet) { UrReturnHelper ReturnValue(PropSize, PropValue, PropSizeRet); switch (PropName) { @@ -1798,3 +1764,5 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCommandGetInfoExp( return UR_RESULT_ERROR_INVALID_ENUMERATION; } + +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp index df539354cb..c86c6f5ba7 100644 --- a/source/adapters/level_zero/command_buffer.hpp +++ b/source/adapters/level_zero/command_buffer.hpp @@ -17,6 +17,7 @@ #include "common.hpp" #include "context.hpp" +#include "kernel.hpp" #include "queue.hpp" struct command_buffer_profiling_t { @@ -34,8 +35,6 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_event_handle_t WaitEvent, ur_event_handle_t AllResetEvent, const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList); - ~ur_exp_command_buffer_handle_t_(); - void registerSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint, ur_event_handle_t Event); @@ -65,6 +64,10 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { */ ze_command_list_handle_t chooseCommandList(bool PreferCopyEngine); + // Releases the resources associated with the command-buffer before the + // command-buffer object is destroyed. 
+ void cleanupCommandBufferResources(); + // UR context associated with this command-buffer ur_context_handle_t Context; // Device associated with this command buffer diff --git a/source/adapters/level_zero/common.cpp b/source/adapters/level_zero/common.cpp index 9dfb5a2b19..f5d8b20014 100644 --- a/source/adapters/level_zero/common.cpp +++ b/source/adapters/level_zero/common.cpp @@ -67,28 +67,6 @@ ur_result_t ze2urResult(ze_result_t ZeResult) { } } -/// Checks the version of the level-zero driver. -/// @param ZeDriver Level Zero Driver handle -/// @param VersionMajor Major verion number to compare to. -/// @param VersionMinor Minor verion number to compare to. -/// @param VersionBuild Build verion number to compare to. -/// @return true is the version of the driver is higher than or equal to the -/// compared version -bool isDriverVersionNewerOrSimilar(ze_driver_handle_t ZeDriver, - uint32_t VersionMajor, uint32_t VersionMinor, - uint32_t VersionBuild) { - ZeStruct ZeDriverProperties; - ZE2UR_CALL(zeDriverGetProperties, (ZeDriver, &ZeDriverProperties)); - uint32_t DriverVersion = ZeDriverProperties.driverVersion; - auto DriverVersionMajor = (DriverVersion & 0xFF000000) >> 24; - auto DriverVersionMinor = (DriverVersion & 0x00FF0000) >> 16; - auto DriverVersionBuild = DriverVersion & 0x0000FFFF; - - return ((DriverVersionMajor >= VersionMajor) && - (DriverVersionMinor >= VersionMinor) && - (DriverVersionBuild >= VersionBuild)); -} - // This function will ensure compatibility with both Linux and Windows for // setting environment variables. 
bool setEnvVar(const char *name, const char *value) { @@ -167,11 +145,14 @@ ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, const char *ZeArgs, bool TraceError) { logger::debug("ZE ---> {}{}", ZeName, ZeArgs); - if (UrL0LeaksDebug) { - ++(*ZeCallCount)[ZeName]; + if (ZeResult == ZE_RESULT_SUCCESS) { + if (UrL0LeaksDebug) { + ++(*ZeCallCount)[ZeName]; + } + return ZE_RESULT_SUCCESS; } - if (ZeResult && TraceError) { + if (TraceError) { const char *ErrorString = "Unknown"; zeParseError(ZeResult, ErrorString); logger::error("Error ({}) in {}", ErrorString, ZeName); diff --git a/source/adapters/level_zero/common.hpp b/source/adapters/level_zero/common.hpp index a81b852727..6dd8a614c5 100644 --- a/source/adapters/level_zero/common.hpp +++ b/source/adapters/level_zero/common.hpp @@ -19,12 +19,14 @@ #include #include -#include +#include #include #include #include +#include "logger/ur_logger.hpp" + struct _ur_platform_handle_t; static auto getUrResultString = [](ur_result_t Result) { @@ -168,7 +170,7 @@ static auto getUrResultString = [](ur_result_t Result) { } }; -// Trace an internal PI call; returns in case of an error. +// Trace an internal UR call; returns in case of an error. #define UR_CALL(Call) \ { \ if (PrintTrace) \ @@ -180,6 +182,18 @@ static auto getUrResultString = [](ur_result_t Result) { return Result; \ } +// Trace an internal UR call; throw in case of an error. +#define UR_CALL_THROWS(Call) \ + { \ + if (PrintTrace) \ + logger::always("UR ---> {}", #Call); \ + ur_result_t Result = (Call); \ + if (PrintTrace) \ + logger::always("UR <--- {}({})", #Call, getUrResultString(Result)); \ + if (Result != UR_RESULT_SUCCESS) \ + throw Result; \ + } + // Controls UR L0 calls tracing. enum UrDebugLevel { UR_L0_DEBUG_NONE = 0x0, @@ -207,6 +221,15 @@ const int UrL0LeaksDebug = [] { return std::atoi(UrRet); }(); +// Enable for UR L0 Adapter to Init all L0 Drivers on the system with filtering +// in place for only currently used Drivers. 
+const int UrL0InitAllDrivers = [] { + const char *UrRet = std::getenv("UR_L0_INIT_ALL_DRIVERS"); + if (!UrRet) + return 0; + return std::atoi(UrRet); +}(); + // Controls Level Zero calls serialization to w/a Level Zero driver being not MT // ready. Recognized values (can be used as a bit mask): enum { @@ -317,11 +340,6 @@ bool setEnvVar(const char *name, const char *value); // Map Level Zero runtime error code to UR error code. ur_result_t ze2urResult(ze_result_t ZeResult); -/// Checks the version of the level-zero driver. -bool isDriverVersionNewerOrSimilar(ze_driver_handle_t ZeDriver, - uint32_t VersionMajor, uint32_t VersionMinor, - uint32_t VersionBuild); - // Trace a call to Level-Zero RT #define ZE2UR_CALL(ZeName, ZeArgs) \ { \ diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index fc55b532b7..296e3e98d5 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -18,9 +18,9 @@ #include "queue.hpp" #include "ur_level_zero.hpp" -#include "v2/context.hpp" +namespace ur::level_zero { -UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( +ur_result_t urContextCreate( uint32_t DeviceCount, ///< [in] the number of devices given in phDevices const ur_device_handle_t *Devices, ///< [in][range(0, DeviceCount)] array of handle of devices. @@ -38,7 +38,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( ZE2UR_CALL(zeContextCreate, (Platform->ZeDriver, &ContextDesc, &ZeContext)); try { ur_context_handle_t_ *Context = - new v2::ur_context_handle_t_(ZeContext, DeviceCount, Devices, true); + new ur_context_handle_t_(ZeContext, DeviceCount, Devices, true); Context->initialize(); *RetContext = reinterpret_cast(Context); @@ -55,7 +55,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urContextRetain( +ur_result_t urContextRetain( ur_context_handle_t Context ///< [in] handle of the context to get a reference of. 
) { @@ -63,7 +63,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextRetain( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urContextRelease( +ur_result_t urContextRelease( ur_context_handle_t Context ///< [in] handle of the context to release. ) { ur_platform_handle_t Plt = Context->getPlatform(); @@ -87,7 +87,7 @@ static const bool UseMemcpy2DOperations = [] { return std::atoi(UseMemcpy2DOperationsFlag) > 0; }(); -UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( +ur_result_t urContextGetInfo( ur_context_handle_t Context, ///< [in] handle of the context ur_context_info_t ContextInfoType, ///< [in] type of the info to retrieve size_t PropSize, ///< [in] the number of bytes of memory pointed to by @@ -135,7 +135,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle( +ur_result_t urContextGetNativeHandle( ur_context_handle_t Context, ///< [in] handle of the context. ur_native_handle_t *NativeContext ///< [out] a pointer to the native ///< handle of the context. @@ -144,7 +144,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( +ur_result_t urContextCreateWithNativeHandle( ur_native_handle_t NativeContext, ///< [in] the native handle of the context. ur_adapter_handle_t, uint32_t NumDevices, const ur_device_handle_t *Devices, @@ -168,7 +168,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urContextSetExtendedDeleter( +ur_result_t urContextSetExtendedDeleter( ur_context_handle_t Context, ///< [in] handle of the context. ur_context_extended_deleter_t Deleter, ///< [in] Function pointer to extended deleter. 
@@ -182,6 +182,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextSetExtendedDeleter( "{} function not implemented!", __FUNCTION__); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +} // namespace ur::level_zero ur_result_t ur_context_handle_t_::initialize() { @@ -195,7 +196,7 @@ ur_result_t ur_context_handle_t_::initialize() { DeviceMemPools.emplace( std::piecewise_construct, std::make_tuple(Device->ZeDevice), std::make_tuple(umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + umfDisjointPoolOps(), std::move(MemProvider), &DisjointPoolConfigInstance .Configs[usm::DisjointPoolMemType::Device]) .second)); @@ -206,7 +207,7 @@ ur_result_t ur_context_handle_t_::initialize() { SharedMemPools.emplace( std::piecewise_construct, std::make_tuple(Device->ZeDevice), std::make_tuple(umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + umfDisjointPoolOps(), std::move(MemProvider), &DisjointPoolConfigInstance .Configs[usm::DisjointPoolMemType::Shared]) .second)); @@ -218,7 +219,7 @@ ur_result_t ur_context_handle_t_::initialize() { std::piecewise_construct, std::make_tuple(Device->ZeDevice), std::make_tuple( umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + umfDisjointPoolOps(), std::move(MemProvider), &DisjointPoolConfigInstance .Configs[usm::DisjointPoolMemType::SharedReadOnly]) .second)); @@ -271,7 +272,7 @@ ur_result_t ur_context_handle_t_::initialize() { .second; HostMemPool = umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + umfDisjointPoolOps(), std::move(MemProvider), &DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Host]) .second; @@ -511,7 +512,7 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( // Create one event ZePool per MaxNumEventsPerPool events if (*ZePool == nullptr) { ze_event_pool_counter_based_exp_desc_t counterBasedExt = { - ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC}; + 
ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC, nullptr, 0}; ZeStruct ZeEventPoolDesc; ZeEventPoolDesc.count = MaxNumEventsPerPool; ZeEventPoolDesc.flags = 0; @@ -529,6 +530,8 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( counterBasedExt.flags = ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_NON_IMMEDIATE; } + logger::debug("ze_event_pool_desc_t counter based flags set to: {}", + counterBasedExt.flags); ZeEventPoolDesc.pNext = &counterBasedExt; } @@ -578,8 +581,8 @@ void ur_context_handle_t_::addEventToContextCache(ur_event_handle_t Event) { std::scoped_lock Lock(EventCacheMutex); ur_device_handle_t Device = nullptr; - if (!Event->IsMultiDevice && Legacy(Event->UrQueue)) { - Device = Legacy(Event->UrQueue)->Device; + if (!Event->IsMultiDevice && Event->UrQueue) { + Device = Event->UrQueue->Device; } auto Cache = getEventCache(Event->isHostVisible(), @@ -600,10 +603,10 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) { ze_device_handle_t ZeDevice = nullptr; bool UsingImmediateCommandlists = - !Legacy(Event->UrQueue) || Legacy(Event->UrQueue)->UsingImmCmdLists; + !Event->UrQueue || Event->UrQueue->UsingImmCmdLists; - if (!Event->IsMultiDevice && Legacy(Event->UrQueue)) { - ZeDevice = Legacy(Event->UrQueue)->Device->ZeDevice; + if (!Event->IsMultiDevice && Event->UrQueue) { + ZeDevice = Event->UrQueue->Device->ZeDevice; } std::list *ZePoolCache = getZeEventPoolCache( @@ -646,7 +649,7 @@ static const size_t CmdListsCleanupThreshold = [] { // Retrieve an available command list to be used in a PI call. 
ur_result_t ur_context_handle_t_::getAvailableCommandList( - ur_queue_handle_legacy_t Queue, ur_command_list_ptr_t &CommandList, + ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList, bool UseCopyEngine, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, bool AllowBatching, ze_command_queue_handle_t *ForcedCmdQueue) { @@ -769,9 +772,11 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList( CommandList = Queue->CommandListMap .emplace(ZeCommandList, - ur_command_list_info_t(ZeFence, true, false, - ZeCommandQueue, ZeQueueDesc, - Queue->useCompletionBatching())) + ur_command_list_info_t( + ZeFence, true, false, ZeCommandQueue, ZeQueueDesc, + Queue->useCompletionBatching(), true, + ZeCommandListIt->second.InOrderList, + ZeCommandListIt->second.IsImmediate)) .first; } ZeCommandListCache.erase(ZeCommandListIt); @@ -831,3 +836,12 @@ bool ur_context_handle_t_::isValidDevice(ur_device_handle_t Device) const { } return false; } + +const std::vector & +ur_context_handle_t_::getDevices() const { + return Devices; +} + +ze_context_handle_t ur_context_handle_t_::getZeHandle() const { + return ZeContext; +} diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index dc70a2470c..e7c0d784a0 100644 --- a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include @@ -100,6 +100,9 @@ struct ur_context_handle_t_ : _ur_object { l0_command_list_cache_info>>> ZeCopyCommandListCache; + std::unordered_map> + P2PDeviceCache; + // Store USM pool for USM shared and device allocations. There is 1 memory // pool per each pair of (context, device) per each memory type. 
std::unordered_map @@ -193,6 +196,9 @@ struct ur_context_handle_t_ : _ur_object { // Return the Platform, which is the same for all devices in the context ur_platform_handle_t getPlatform() const; + // Get vector of devices from this context + const std::vector &getDevices() const; + // Get index of the free slot in the available pool. If there is no available // pool then create new one. The HostVisible parameter tells if we need a // slot for a host-visible event. The ProfilingEnabled tells is we need a @@ -294,7 +300,7 @@ struct ur_context_handle_t_ : _ur_object { // for executing on this device. Immediate commandlists are created only // once for each SYCL Queue and after that they are reused. ur_result_t getAvailableCommandList( - ur_queue_handle_legacy_t Queue, ur_command_list_ptr_t &CommandList, + ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList, bool UseCopyEngine, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, bool AllowBatching = false, ze_command_queue_handle_t *ForcedCmdQueue = nullptr); @@ -303,6 +309,9 @@ struct ur_context_handle_t_ : _ur_object { // For that the Device or its root devices need to be in the context. bool isValidDevice(ur_device_handle_t Device) const; + // Get handle to the L0 context + ze_context_handle_t getZeHandle() const; + private: // Get the cache of events for a provided scope and profiling mode. 
auto getEventCache(bool HostVisible, bool WithProfiling, diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 086c92b19a..866fd0e15f 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -10,13 +10,59 @@ #include "device.hpp" #include "adapter.hpp" #include "logger/ur_logger.hpp" +#include "ur_interface_loader.hpp" #include "ur_level_zero.hpp" #include "ur_util.hpp" #include #include #include -UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet( +// UR_L0_USE_COPY_ENGINE can be set to an integer value, or +// a pair of integer values of the form "lower_index:upper_index". +// Here, the indices point to copy engines in a list of all available copy +// engines. +// This functions returns this pair of indices. +// If the user specifies only a single integer, a value of 0 indicates that +// the copy engines will not be used at all. A value of 1 indicates that all +// available copy engines can be used. +const std::pair +getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device) { + const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE"); + static const char *EnvVar = UrRet ? UrRet : (PiRet ? PiRet : nullptr); + // If the environment variable is not set, no copy engines are used when + // immediate commandlists are being used. For standard commandlists all are + // used. + if (!EnvVar) { + if (Device->ImmCommandListUsed) + return std::pair(0, 0); // Only main copy engine will be used. + return std::pair(0, INT_MAX); // All copy engines will be used. + } + std::string CopyEngineRange = EnvVar; + // Environment variable can be a single integer or a pair of integers + // separated by ":" + auto pos = CopyEngineRange.find(":"); + if (pos == std::string::npos) { + bool UseCopyEngine = (std::stoi(CopyEngineRange) != 0); + if (UseCopyEngine) + return std::pair(0, INT_MAX); // All copy engines can be used. 
+ return std::pair(-1, -1); // No copy engines will be used. + } + int LowerCopyEngineIndex = std::stoi(CopyEngineRange.substr(0, pos)); + int UpperCopyEngineIndex = std::stoi(CopyEngineRange.substr(pos + 1)); + if ((LowerCopyEngineIndex > UpperCopyEngineIndex) || + (LowerCopyEngineIndex < -1) || (UpperCopyEngineIndex < -1)) { + logger::error("UR_L0_LEVEL_ZERO_USE_COPY_ENGINE: invalid value provided, " + "default set."); + LowerCopyEngineIndex = 0; + UpperCopyEngineIndex = INT_MAX; + } + return std::pair(LowerCopyEngineIndex, UpperCopyEngineIndex); +} + +namespace ur::level_zero { + +ur_result_t urDeviceGet( ur_platform_handle_t Platform, ///< [in] handle of the platform instance ur_device_type_t DeviceType, ///< [in] the type of the devices. uint32_t NumEntries, ///< [in] the number of devices to be added to @@ -143,7 +189,7 @@ uint64_t calculateGlobalMemSize(ur_device_handle_t Device) { return Device->ZeGlobalMemSize.operator->()->value; } -UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( +ur_result_t urDeviceGetInfo( ur_device_handle_t Device, ///< [in] handle of the device instance ur_device_info_t ParamName, ///< [in] type of the info to retrieve size_t propSize, ///< [in] the number of bytes pointed to by ParamValue. @@ -701,11 +747,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( } case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { - if (getenv("ZES_ENABLE_SYSMAN") == nullptr) { - setErrorMessage("Set ZES_ENABLE_SYSMAN=1 to obtain free memory", - UR_RESULT_ERROR_UNINITIALIZED, - static_cast(ZE_RESULT_ERROR_UNINITIALIZED)); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + bool SysManEnv = getenv_tobool("ZES_ENABLE_SYSMAN", false); + if ((Device->Platform->ZedeviceToZesDeviceMap.size() == 0) && !SysManEnv) { + logger::error("SysMan support is unavailable on this system. 
Please " + "check your level zero driver installation."); + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } // Calculate the global memory size as the max limit that can be reported as // "free" memory for the user to allocate. @@ -714,30 +760,57 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( // Currently this is only the one enumerated with ordinal 0. uint64_t FreeMemory = 0; uint32_t MemCount = 0; - ZE2UR_CALL(zesDeviceEnumMemoryModules, (ZeDevice, &MemCount, nullptr)); + + zes_device_handle_t ZesDevice = Device->ZeDevice; + struct ur_zes_device_handle_data_t ZesDeviceData = {}; + // If legacy sysman is enabled thru the environment variable, then zesInit + // will fail, but sysman is still usable so go the legacy route. + if (!SysManEnv) { + auto It = Device->Platform->ZedeviceToZesDeviceMap.find(Device->ZeDevice); + if (It == Device->Platform->ZedeviceToZesDeviceMap.end()) { + // no matching device + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } else { + ZesDeviceData = + Device->Platform->ZedeviceToZesDeviceMap[Device->ZeDevice]; + ZesDevice = ZesDeviceData.ZesDevice; + } + } + + ZE2UR_CALL(zesDeviceEnumMemoryModules, (ZesDevice, &MemCount, nullptr)); if (MemCount != 0) { std::vector ZesMemHandles(MemCount); ZE2UR_CALL(zesDeviceEnumMemoryModules, - (ZeDevice, &MemCount, ZesMemHandles.data())); + (ZesDevice, &MemCount, ZesMemHandles.data())); for (auto &ZesMemHandle : ZesMemHandles) { ZesStruct ZesMemProperties; ZE2UR_CALL(zesMemoryGetProperties, (ZesMemHandle, &ZesMemProperties)); // For root-device report memory from all memory modules since that // is what totally available in the default implicit scaling mode. // For sub-devices only report memory local to them. 
- if (!Device->isSubDevice() || Device->ZeDeviceProperties->subdeviceId == - ZesMemProperties.subdeviceId) { - - ZesStruct ZesMemState; - ZE2UR_CALL(zesMemoryGetState, (ZesMemHandle, &ZesMemState)); - FreeMemory += ZesMemState.free; + if (SysManEnv) { + if (!Device->isSubDevice() || + Device->ZeDeviceProperties->subdeviceId == + ZesMemProperties.subdeviceId) { + + ZesStruct ZesMemState; + ZE2UR_CALL(zesMemoryGetState, (ZesMemHandle, &ZesMemState)); + FreeMemory += ZesMemState.free; + } + } else { + if (ZesDeviceData.SubDeviceId == ZesMemProperties.subdeviceId || + !ZesDeviceData.SubDevice) { + ZesStruct ZesMemState; + ZE2UR_CALL(zesMemoryGetState, (ZesMemHandle, &ZesMemState)); + FreeMemory += ZesMemState.free; + } } } } if (MemCount > 0) { return ReturnValue(std::min(GlobalMemSize, FreeMemory)); } else { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } } case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { @@ -948,36 +1021,60 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( } case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: return ReturnValue(true); - case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: { - // Update support requires being able to update kernel arguments and all - // aspects of the kernel NDRange. 
- const ze_mutable_command_exp_flags_t UpdateMask = - ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS | - ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT | - ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE | - ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET; - - const bool KernelArgUpdateSupport = - (Device->ZeDeviceMutableCmdListsProperties->mutableCommandFlags & - UpdateMask) == UpdateMask; - return ReturnValue(KernelArgUpdateSupport && - Device->Platform->ZeMutableCmdListExt.Supported); + case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP: { + const auto ZeMutableCommandFlags = + Device->ZeDeviceMutableCmdListsProperties->mutableCommandFlags; + + auto supportsFlags = [&](ze_mutable_command_exp_flags_t RequiredFlags) { + return (ZeMutableCommandFlags & RequiredFlags) == RequiredFlags; + }; + + ur_device_command_buffer_update_capability_flags_t UpdateCapabilities = 0; + if (supportsFlags(ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS)) { + UpdateCapabilities |= + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS; + } + /* These capabilities are bundled together because, when the user updates + * the global work-size, the implementation might have to generate a new + * local work-size. This would require both mutable command flags to be set + * even though only the global work-size was explicitly updated. */ + if (supportsFlags(ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT | + ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE)) { + UpdateCapabilities |= + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE; + } + if (supportsFlags(ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET)) { + UpdateCapabilities |= + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET; + } + return ReturnValue(UpdateCapabilities); } + case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP: + return ReturnValue(false); case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP: { - // On L0 bindless images are supported. 
- return ReturnValue(true); + bool DeviceIsDG2OrNewer = + Device->ZeDeviceIpVersionExt->ipVersion >= 0x030dc000; + return ReturnValue(DeviceIsDG2OrNewer && + Device->ZeDeviceImageProperties->maxImageDims1D > 0 && + Device->ZeDeviceImageProperties->maxImageDims2D > 0 && + Device->ZeDeviceImageProperties->maxImageDims3D > 0); } case UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP: { // On L0 bindless images can not be backed by shared (managed) USM. return ReturnValue(false); } case UR_DEVICE_INFO_BINDLESS_IMAGES_1D_USM_SUPPORT_EXP: { - // On L0 1D bindless image USM are supported. - return ReturnValue(true); + bool DeviceIsDG2OrNewer = + Device->ZeDeviceIpVersionExt->ipVersion >= 0x030dc000; + return ReturnValue(DeviceIsDG2OrNewer && + Device->ZeDeviceImageProperties->maxImageDims1D > 0); } case UR_DEVICE_INFO_BINDLESS_IMAGES_2D_USM_SUPPORT_EXP: { - // On L0 2D bindless image USM are supported. - return ReturnValue(true); + bool DeviceIsDG2OrNewer = + Device->ZeDeviceIpVersionExt->ipVersion >= 0x030dc000; + return ReturnValue(DeviceIsDG2OrNewer && + Device->ZeDeviceImageProperties->maxImageDims2D > 0); } case UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP: case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP: @@ -1002,11 +1099,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( // L0 does not support creation of images from individual mipmap levels. return ReturnValue(false); } - case UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP: { + case UR_DEVICE_INFO_EXTERNAL_MEMORY_IMPORT_SUPPORT_EXP: { // L0 does not support importing external memory. return ReturnValue(false); } - case UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP: { + case UR_DEVICE_INFO_EXTERNAL_SEMAPHORE_IMPORT_SUPPORT_EXP: { // L0 does not support importing external semaphores. 
return ReturnValue(false); } @@ -1068,153 +1165,348 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return UR_RESULT_SUCCESS; } -// UR_L0_USE_COPY_ENGINE can be set to an integer value, or -// a pair of integer values of the form "lower_index:upper_index". -// Here, the indices point to copy engines in a list of all available copy -// engines. -// This functions returns this pair of indices. -// If the user specifies only a single integer, a value of 0 indicates that -// the copy engines will not be used at all. A value of 1 indicates that all -// available copy engines can be used. -const std::pair -getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device) { - const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE"); - static const char *EnvVar = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - // If the environment variable is not set, no copy engines are used when - // immediate commandlists are being used. For standard commandlists all are - // used. - if (!EnvVar) { - if (Device->ImmCommandListUsed) - return std::pair(0, 0); // Only main copy engine will be used. - return std::pair(0, INT_MAX); // All copy engines will be used. - } - std::string CopyEngineRange = EnvVar; - // Environment variable can be a single integer or a pair of integers - // separated by ":" - auto pos = CopyEngineRange.find(":"); - if (pos == std::string::npos) { - bool UseCopyEngine = (std::stoi(CopyEngineRange) != 0); - if (UseCopyEngine) - return std::pair(0, INT_MAX); // All copy engines can be used. - return std::pair(-1, -1); // No copy engines will be used. 
- } - int LowerCopyEngineIndex = std::stoi(CopyEngineRange.substr(0, pos)); - int UpperCopyEngineIndex = std::stoi(CopyEngineRange.substr(pos + 1)); - if ((LowerCopyEngineIndex > UpperCopyEngineIndex) || - (LowerCopyEngineIndex < -1) || (UpperCopyEngineIndex < -1)) { - logger::error("UR_L0_LEVEL_ZERO_USE_COPY_ENGINE: invalid value provided, " - "default set."); - LowerCopyEngineIndex = 0; - UpperCopyEngineIndex = INT_MAX; - } - return std::pair(LowerCopyEngineIndex, UpperCopyEngineIndex); -} - bool CopyEngineRequested(const ur_device_handle_t &Device) { int LowerCopyQueueIndex = getRangeOfAllowedCopyEngines(Device).first; int UpperCopyQueueIndex = getRangeOfAllowedCopyEngines(Device).second; return ((LowerCopyQueueIndex != -1) || (UpperCopyQueueIndex != -1)); } -// Whether immediate commandlists will be used for kernel launches and copies. -// The default is standard commandlists. Setting 1 or 2 specifies use of -// immediate commandlists. Note: when immediate commandlists are used then -// device-only events must be either AllHostVisible or OnDemandHostVisibleProxy. -// (See env var UR_L0_DEVICE_SCOPE_EVENTS). - -// Get value of immediate commandlists env var setting or -1 if unset -ur_device_handle_t_::ImmCmdlistMode -ur_device_handle_t_::useImmediateCommandLists() { - // If immediate commandlist setting is not explicitly set, then use the device - // default. - // TODO: confirm this is good once make_queue revert is added - static const int ImmediateCommandlistsSetting = [] { - const char *UrRet = std::getenv("UR_L0_USE_IMMEDIATE_COMMANDLISTS"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"); - const char *ImmediateCommandlistsSettingStr = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - if (!ImmediateCommandlistsSettingStr) - return -1; - return std::atoi(ImmediateCommandlistsSettingStr); - }(); - - if (ImmediateCommandlistsSetting == -1) - // Change this to PerQueue as default after more testing. 
-#ifdef _WIN32 - return NotUsed; -#else - return isPVC() ? PerQueue : NotUsed; -#endif - switch (ImmediateCommandlistsSetting) { - case 0: - return NotUsed; - case 1: - return PerQueue; - case 2: - return PerThreadPerQueue; - default: - return NotUsed; +ur_result_t urDevicePartition( + ur_device_handle_t Device, ///< [in] handle of the device to partition. + const ur_device_partition_properties_t + *Properties, ///< [in] Device partition properties. + uint32_t NumDevices, ///< [in] the number of sub-devices. + ur_device_handle_t + *OutDevices, ///< [out][optional][range(0, NumDevices)] array of handle + ///< of devices. If NumDevices is less than the number of + ///< sub-devices available, then the function shall only + ///< retrieve that number of sub-devices. + uint32_t *NumDevicesRet ///< [out][optional] pointer to the number of + ///< sub-devices the device can be partitioned into + ///< according to the partitioning property. +) { + // Other partitioning ways are not supported by Level Zero + UR_ASSERT(Properties->PropCount == 1, UR_RESULT_ERROR_INVALID_VALUE); + if (Properties->pProperties->type == UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN) { + if ((Properties->pProperties->value.affinity_domain != + UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE && + Properties->pProperties->value.affinity_domain != + UR_DEVICE_AFFINITY_DOMAIN_FLAG_NUMA)) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + } else if (Properties->pProperties->type == UR_DEVICE_PARTITION_BY_CSLICE) { + if (Properties->pProperties->value.affinity_domain != 0) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + } else { + return UR_RESULT_ERROR_INVALID_VALUE; } -} - -bool ur_device_handle_t_::useRelaxedAllocationLimits() { - static const bool EnableRelaxedAllocationLimits = [] { - auto UrRet = ur_getenv("UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS"); - const bool RetVal = UrRet ? 
std::stoi(*UrRet) : 0; - return RetVal; - }(); - return EnableRelaxedAllocationLimits; -} + // Devices cache is normally created in piDevicesGet but still make + // sure that cache is populated. + // + auto Res = Device->Platform->populateDeviceCacheIfNeeded(); + if (Res != UR_RESULT_SUCCESS) { + return Res; + } -bool ur_device_handle_t_::useDriverInOrderLists() { - // Use in-order lists implementation from L0 driver instead - // of adapter's implementation. + auto EffectiveNumDevices = [&]() -> decltype(Device->SubDevices.size()) { + if (Device->SubDevices.size() == 0) + return 0; - ze_driver_handle_t ZeDriver = this->Platform->ZeDriver; + // Sub-Sub-Devices are partitioned by CSlices, not by affinity domain. + // However, if + // UR_L0_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING overrides that + // still expose CSlices in partitioning by affinity domain for compatibility + // reasons. + if (Properties->pProperties->type == + UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN && + !ExposeCSliceInAffinityPartitioning) { + if (Device->isSubDevice()) { + return 0; + } + } + if (Properties->pProperties->type == UR_DEVICE_PARTITION_BY_CSLICE) { + // Not a CSlice-based partitioning. + if (!Device->SubDevices[0]->isCCS()) { + return 0; + } + } - static const bool UseDriverInOrderLists = [&] { - const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS"); - bool CompatibleDriver = isDriverVersionNewerOrSimilar( - ZeDriver, 1, 3, L0_DRIVER_INORDER_MIN_VERSION); - if (!UrRet) - return CompatibleDriver; - return std::atoi(UrRet) != 0; + return Device->SubDevices.size(); }(); - return UseDriverInOrderLists; -} - -ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, - int SubSubDeviceIndex) { - // Maintain various device properties cache. - // Note that we just describe here how to compute the data. - // The real initialization is upon first access. + // TODO: Consider support for partitioning to <= total sub-devices. 
+ // Currently supported partitioning (by affinity domain/numa) would always + // partition to all sub-devices. // - auto ZeDevice = this->ZeDevice; - ZeDeviceProperties.Compute = [ZeDevice](ze_device_properties_t &Properties) { - ZE_CALL_NOCHECK(zeDeviceGetProperties, (ZeDevice, &Properties)); - }; + if (NumDevices != 0) + UR_ASSERT(NumDevices == EffectiveNumDevices, UR_RESULT_ERROR_INVALID_VALUE); - ZeDeviceComputeProperties.Compute = - [ZeDevice](ze_device_compute_properties_t &Properties) { - ZE_CALL_NOCHECK(zeDeviceGetComputeProperties, (ZeDevice, &Properties)); - }; + for (uint32_t I = 0; I < NumDevices; I++) { + auto prop = Properties->pProperties[0]; + if (prop.type == UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN) { + // In case the value is NEXT_PARTITIONABLE, we need to change it to the + // chosen domain. This will always be NUMA since that's the only domain + // supported by level zero. + prop.value.affinity_domain = UR_DEVICE_AFFINITY_DOMAIN_FLAG_NUMA; + } + Device->SubDevices[I]->SubDeviceCreationProperty = prop; - ZeDeviceIpVersionExt.Compute = - [ZeDevice](ze_device_ip_version_ext_t &Properties) { - ze_device_properties_t P; - P.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; - P.pNext = (void *)&Properties; - ZE_CALL_NOCHECK(zeDeviceGetProperties, (ZeDevice, &P)); - }; + OutDevices[I] = Device->SubDevices[I]; + // reusing the same pi_device needs to increment the reference count + ur::level_zero::urDeviceRetain(OutDevices[I]); + } - ZeDeviceImageProperties.Compute = - [ZeDevice](ze_device_image_properties_t &Properties) { - ZE_CALL_NOCHECK(zeDeviceGetImageProperties, (ZeDevice, &Properties)); - }; + if (NumDevicesRet) { + *NumDevicesRet = EffectiveNumDevices; + } + return UR_RESULT_SUCCESS; +} + +ur_result_t urDeviceSelectBinary( + ur_device_handle_t + Device, ///< [in] handle of the device to select binary for. + const ur_device_binary_t + *Binaries, ///< [in] the array of binaries to select from. 
+ uint32_t NumBinaries, ///< [in] the number of binaries passed in ppBinaries. + ///< Must greater than or equal to zero otherwise + ///< ::UR_RESULT_ERROR_INVALID_VALUE is returned. + uint32_t + *SelectedBinary ///< [out] the index of the selected binary in the input + ///< array of binaries. If a suitable binary was not + ///< found the function returns ${X}_INVALID_BINARY. +) { + std::ignore = Device; + // TODO: this is a bare-bones implementation for choosing a device image + // that would be compatible with the targeted device. An AOT-compiled + // image is preferred over SPIR-V for known devices (i.e. Intel devices) + // The implementation makes no effort to differentiate between multiple images + // for the given device, and simply picks the first one compatible. + // + // Real implementation will use the same mechanism OpenCL ICD dispatcher + // uses. Something like: + // PI_VALIDATE_HANDLE_RETURN_HANDLE(ctx, PI_ERROR_INVALID_CONTEXT); + // return context->dispatch->piextDeviceSelectIR( + // ctx, images, num_images, selected_image); + // where context->dispatch is set to the dispatch table provided by PI + // plugin for platform/device the ctx was created for. + + // Look for GEN binary, which we known can only be handled by Level-Zero now. 
+ const char *BinaryTarget = + UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; // UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; + + uint32_t *SelectedBinaryInd = SelectedBinary; + + // Find the appropriate device image, fallback to spirv if not found + constexpr uint32_t InvalidInd = (std::numeric_limits::max)(); + uint32_t Spirv = InvalidInd; + + for (uint32_t i = 0; i < NumBinaries; ++i) { + if (strcmp(Binaries[i].pDeviceTargetSpec, BinaryTarget) == 0) { + *SelectedBinaryInd = i; + return UR_RESULT_SUCCESS; + } + if (strcmp(Binaries[i].pDeviceTargetSpec, + UR_DEVICE_BINARY_TARGET_SPIRV64) == 0) + Spirv = i; + } + // Points to a spirv image, if such indeed was found + if ((*SelectedBinaryInd = Spirv) != InvalidInd) + return UR_RESULT_SUCCESS; + + // No image can be loaded for the given device + return UR_RESULT_ERROR_INVALID_BINARY; +} + +ur_result_t urDeviceGetNativeHandle( + ur_device_handle_t Device, ///< [in] handle of the device. + ur_native_handle_t + *NativeDevice ///< [out] a pointer to the native handle of the device. +) { + *NativeDevice = reinterpret_cast(Device->ZeDevice); + return UR_RESULT_SUCCESS; +} + +ur_result_t urDeviceCreateWithNativeHandle( + ur_native_handle_t NativeDevice, ///< [in] the native handle of the device. + [[maybe_unused]] ur_adapter_handle_t + Adapter, ///< [in] handle of the platform instance + [[maybe_unused]] const ur_device_native_properties_t + *Properties, ///< [in][optional] pointer to native device properties + ///< struct. + ur_device_handle_t + *Device ///< [out] pointer to the handle of the device object created. +) { + auto ZeDevice = ur_cast(NativeDevice); + + // The SYCL spec requires that the set of devices must remain fixed for the + // duration of the application's execution. We assume that we found all of the + // Level Zero devices when we initialized the platforms/devices cache, so the + // "NativeHandle" must already be in the cache. If it is not, this must not be + // a valid Level Zero device. 
+ + ur_device_handle_t Dev = nullptr; + if (const auto *platforms = GlobalAdapter->PlatformCache->get_value()) { + for (const auto &p : *platforms) { + Dev = p->getDeviceFromNativeHandle(ZeDevice); + } + } else { + return GlobalAdapter->PlatformCache->get_error(); + } + + if (Dev == nullptr) + return UR_RESULT_ERROR_INVALID_VALUE; + + *Device = Dev; + return UR_RESULT_SUCCESS; +} + +ur_result_t urDeviceGetGlobalTimestamps( + ur_device_handle_t Device, ///< [in] handle of the device instance + uint64_t *DeviceTimestamp, ///< [out][optional] pointer to the Device's + ///< global timestamp that correlates with the + ///< Host's global timestamp value + uint64_t *HostTimestamp ///< [out][optional] pointer to the Host's global + ///< timestamp that correlates with the Device's + ///< global timestamp value +) { + const uint64_t &ZeTimerResolution = + Device->ZeDeviceProperties->timerResolution; + const uint64_t TimestampMaxCount = Device->getTimestampMask(); + uint64_t DeviceClockCount, Dummy; + + ZE2UR_CALL(zeDeviceGetGlobalTimestamps, + (Device->ZeDevice, + HostTimestamp == nullptr ? &Dummy : HostTimestamp, + &DeviceClockCount)); + + if (DeviceTimestamp != nullptr) { + *DeviceTimestamp = + (DeviceClockCount & TimestampMaxCount) * ZeTimerResolution; + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t urDeviceRetain(ur_device_handle_t Device) { + // The root-device ref-count remains unchanged (always 1). + if (Device->isSubDevice()) { + Device->RefCount.increment(); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t urDeviceRelease(ur_device_handle_t Device) { + // Root devices are destroyed during the piTearDown process. + if (Device->isSubDevice()) { + if (Device->RefCount.decrementAndTest()) { + delete Device; + } + } + + return UR_RESULT_SUCCESS; +} +} // namespace ur::level_zero + +// Whether immediate commandlists will be used for kernel launches and copies. +// The default is standard commandlists. Setting 1 or 2 specifies use of +// immediate commandlists. 
Note: when immediate commandlists are used then +// device-only events must be either AllHostVisible or OnDemandHostVisibleProxy. +// (See env var UR_L0_DEVICE_SCOPE_EVENTS). + +// Get value of immediate commandlists env var setting or -1 if unset +ur_device_handle_t_::ImmCmdlistMode +ur_device_handle_t_::useImmediateCommandLists() { + // If immediate commandlist setting is not explicitly set, then use the device + // default. + // TODO: confirm this is good once make_queue revert is added + static const int ImmediateCommandlistsSetting = [] { + const char *UrRet = std::getenv("UR_L0_USE_IMMEDIATE_COMMANDLISTS"); + const char *PiRet = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"); + const char *ImmediateCommandlistsSettingStr = + UrRet ? UrRet : (PiRet ? PiRet : nullptr); + if (!ImmediateCommandlistsSettingStr) + return -1; + return std::atoi(ImmediateCommandlistsSettingStr); + }(); + + if (ImmediateCommandlistsSetting == -1) { + bool isDG2SupportedDriver = + this->Platform->isDriverVersionNewerOrSimilar(1, 5, 30820); + if ((isDG2SupportedDriver && isDG2()) || isPVC()) { + return PerQueue; + } else { + return NotUsed; + } + } + switch (ImmediateCommandlistsSetting) { + case 0: + return NotUsed; + case 1: + return PerQueue; + case 2: + return PerThreadPerQueue; + default: + return NotUsed; + } +} + +bool ur_device_handle_t_::useRelaxedAllocationLimits() { + static const bool EnableRelaxedAllocationLimits = [] { + auto UrRet = ur_getenv("UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS"); + const bool RetVal = UrRet ? std::stoi(*UrRet) : 0; + return RetVal; + }(); + + return EnableRelaxedAllocationLimits; +} + +bool ur_device_handle_t_::useDriverInOrderLists() { + // Use in-order lists implementation from L0 driver instead + // of adapter's implementation. 
+ + static const bool UseDriverInOrderLists = [&] { + const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS"); + bool CompatibleDriver = this->Platform->isDriverVersionNewerOrSimilar( + 1, 3, L0_DRIVER_INORDER_MIN_VERSION); + if (!UrRet) + return CompatibleDriver; + return std::atoi(UrRet) != 0; + }(); + + return UseDriverInOrderLists; +} + +ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, + int SubSubDeviceIndex) { + // Maintain various device properties cache. + // Note that we just describe here how to compute the data. + // The real initialization is upon first access. + // + auto ZeDevice = this->ZeDevice; + ZeDeviceProperties.Compute = [ZeDevice](ze_device_properties_t &Properties) { + ZE_CALL_NOCHECK(zeDeviceGetProperties, (ZeDevice, &Properties)); + }; + + ZeDeviceComputeProperties.Compute = + [ZeDevice](ze_device_compute_properties_t &Properties) { + ZE_CALL_NOCHECK(zeDeviceGetComputeProperties, (ZeDevice, &Properties)); + }; + + ZeDeviceIpVersionExt.Compute = + [ZeDevice](ze_device_ip_version_ext_t &Properties) { + ze_device_properties_t P; + P.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + P.pNext = (void *)&Properties; + ZE_CALL_NOCHECK(zeDeviceGetProperties, (ZeDevice, &P)); + }; + + ZeDeviceImageProperties.Compute = + [ZeDevice](ze_device_image_properties_t &Properties) { + ZE_CALL_NOCHECK(zeDeviceGetImageProperties, (ZeDevice, &Properties)); + }; ZeDeviceModuleProperties.Compute = [ZeDevice](ze_device_module_properties_t &Properties) { @@ -1314,7 +1606,7 @@ ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, return UR_RESULT_ERROR_UNKNOWN; } - if (CopyEngineRequested((ur_device_handle_t)this)) { + if (ur::level_zero::CopyEngineRequested((ur_device_handle_t)this)) { for (uint32_t i = 0; i < numQueueGroups; i++) { if (((QueueGroupProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) && @@ -1355,26 +1647,6 @@ ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, return 
UR_RESULT_SUCCESS; } -ur_result_t urDeviceRetain(ur_device_handle_t Device) { - - // The root-device ref-count remains unchanged (always 1). - if (Device->isSubDevice()) { - Device->RefCount.increment(); - } - return UR_RESULT_SUCCESS; -} - -ur_result_t urDeviceRelease(ur_device_handle_t Device) { - // Root devices are destroyed during the piTearDown process. - if (Device->isSubDevice()) { - if (Device->RefCount.decrementAndTest()) { - delete Device; - } - } - - return UR_RESULT_SUCCESS; -} - void ZeDriverVersionStringExtension::setZeDriverVersionString( ur_platform_handle_t_ *Platform) { // Check if Intel Driver Version String is available. If yes, save the API @@ -1442,227 +1714,3 @@ void ZeUSMImportExtension::doZeUSMRelease(ze_driver_handle_t DriverHandle, void *HostPtr) { ZE_CALL_NOCHECK(zexDriverReleaseImportedPointer, (DriverHandle, HostPtr)); } - -UR_APIEXPORT ur_result_t UR_APICALL urDevicePartition( - ur_device_handle_t Device, ///< [in] handle of the device to partition. - const ur_device_partition_properties_t - *Properties, ///< [in] Device partition properties. - uint32_t NumDevices, ///< [in] the number of sub-devices. - ur_device_handle_t - *OutDevices, ///< [out][optional][range(0, NumDevices)] array of handle - ///< of devices. If NumDevices is less than the number of - ///< sub-devices available, then the function shall only - ///< retrieve that number of sub-devices. - uint32_t *NumDevicesRet ///< [out][optional] pointer to the number of - ///< sub-devices the device can be partitioned into - ///< according to the partitioning property. 
-) { - // Other partitioning ways are not supported by Level Zero - UR_ASSERT(Properties->PropCount == 1, UR_RESULT_ERROR_INVALID_VALUE); - if (Properties->pProperties->type == UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN) { - if ((Properties->pProperties->value.affinity_domain != - UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE && - Properties->pProperties->value.affinity_domain != - UR_DEVICE_AFFINITY_DOMAIN_FLAG_NUMA)) { - return UR_RESULT_ERROR_INVALID_VALUE; - } - } else if (Properties->pProperties->type == UR_DEVICE_PARTITION_BY_CSLICE) { - if (Properties->pProperties->value.affinity_domain != 0) { - return UR_RESULT_ERROR_INVALID_VALUE; - } - } else { - return UR_RESULT_ERROR_INVALID_VALUE; - } - - // Devices cache is normally created in piDevicesGet but still make - // sure that cache is populated. - // - auto Res = Device->Platform->populateDeviceCacheIfNeeded(); - if (Res != UR_RESULT_SUCCESS) { - return Res; - } - - auto EffectiveNumDevices = [&]() -> decltype(Device->SubDevices.size()) { - if (Device->SubDevices.size() == 0) - return 0; - - // Sub-Sub-Devices are partitioned by CSlices, not by affinity domain. - // However, if - // UR_L0_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING overrides that - // still expose CSlices in partitioning by affinity domain for compatibility - // reasons. - if (Properties->pProperties->type == - UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN && - !ExposeCSliceInAffinityPartitioning) { - if (Device->isSubDevice()) { - return 0; - } - } - if (Properties->pProperties->type == UR_DEVICE_PARTITION_BY_CSLICE) { - // Not a CSlice-based partitioning. - if (!Device->SubDevices[0]->isCCS()) { - return 0; - } - } - - return Device->SubDevices.size(); - }(); - - // TODO: Consider support for partitioning to <= total sub-devices. - // Currently supported partitioning (by affinity domain/numa) would always - // partition to all sub-devices. 
- // - if (NumDevices != 0) - UR_ASSERT(NumDevices == EffectiveNumDevices, UR_RESULT_ERROR_INVALID_VALUE); - - for (uint32_t I = 0; I < NumDevices; I++) { - auto prop = Properties->pProperties[0]; - if (prop.type == UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN) { - // In case the value is NEXT_PARTITIONABLE, we need to change it to the - // chosen domain. This will always be NUMA since that's the only domain - // supported by level zero. - prop.value.affinity_domain = UR_DEVICE_AFFINITY_DOMAIN_FLAG_NUMA; - } - Device->SubDevices[I]->SubDeviceCreationProperty = prop; - - OutDevices[I] = Device->SubDevices[I]; - // reusing the same pi_device needs to increment the reference count - urDeviceRetain(OutDevices[I]); - } - - if (NumDevicesRet) { - *NumDevicesRet = EffectiveNumDevices; - } - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( - ur_device_handle_t - Device, ///< [in] handle of the device to select binary for. - const ur_device_binary_t - *Binaries, ///< [in] the array of binaries to select from. - uint32_t NumBinaries, ///< [in] the number of binaries passed in ppBinaries. - ///< Must greater than or equal to zero otherwise - ///< ::UR_RESULT_ERROR_INVALID_VALUE is returned. - uint32_t - *SelectedBinary ///< [out] the index of the selected binary in the input - ///< array of binaries. If a suitable binary was not - ///< found the function returns ${X}_INVALID_BINARY. -) { - std::ignore = Device; - // TODO: this is a bare-bones implementation for choosing a device image - // that would be compatible with the targeted device. An AOT-compiled - // image is preferred over SPIR-V for known devices (i.e. Intel devices) - // The implementation makes no effort to differentiate between multiple images - // for the given device, and simply picks the first one compatible. - // - // Real implementation will use the same mechanism OpenCL ICD dispatcher - // uses. 
Something like: - // PI_VALIDATE_HANDLE_RETURN_HANDLE(ctx, PI_ERROR_INVALID_CONTEXT); - // return context->dispatch->piextDeviceSelectIR( - // ctx, images, num_images, selected_image); - // where context->dispatch is set to the dispatch table provided by PI - // plugin for platform/device the ctx was created for. - - // Look for GEN binary, which we known can only be handled by Level-Zero now. - const char *BinaryTarget = - UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; // UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; - - uint32_t *SelectedBinaryInd = SelectedBinary; - - // Find the appropriate device image, fallback to spirv if not found - constexpr uint32_t InvalidInd = (std::numeric_limits::max)(); - uint32_t Spirv = InvalidInd; - - for (uint32_t i = 0; i < NumBinaries; ++i) { - if (strcmp(Binaries[i].pDeviceTargetSpec, BinaryTarget) == 0) { - *SelectedBinaryInd = i; - return UR_RESULT_SUCCESS; - } - if (strcmp(Binaries[i].pDeviceTargetSpec, - UR_DEVICE_BINARY_TARGET_SPIRV64) == 0) - Spirv = i; - } - // Points to a spirv image, if such indeed was found - if ((*SelectedBinaryInd = Spirv) != InvalidInd) - return UR_RESULT_SUCCESS; - - // No image can be loaded for the given device - return UR_RESULT_ERROR_INVALID_BINARY; -} - -UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( - ur_device_handle_t Device, ///< [in] handle of the device. - ur_native_handle_t - *NativeDevice ///< [out] a pointer to the native handle of the device. -) { - *NativeDevice = reinterpret_cast(Device->ZeDevice); - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t NativeDevice, ///< [in] the native handle of the device. - ur_platform_handle_t Platform, ///< [in] handle of the platform instance - const ur_device_native_properties_t - *Properties, ///< [in][optional] pointer to native device properties - ///< struct. - ur_device_handle_t - *Device ///< [out] pointer to the handle of the device object created. 
-) { - std::ignore = Properties; - auto ZeDevice = ur_cast(NativeDevice); - - // The SYCL spec requires that the set of devices must remain fixed for the - // duration of the application's execution. We assume that we found all of the - // Level Zero devices when we initialized the platforms/devices cache, so the - // "NativeHandle" must already be in the cache. If it is not, this must not be - // a valid Level Zero device. - - ur_device_handle_t Dev = nullptr; - if (const auto *platforms = GlobalAdapter->PlatformCache->get_value()) { - for (const auto &p : *platforms) { - Dev = p->getDeviceFromNativeHandle(ZeDevice); - if (Dev) { - // Check that the input Platform, if was given, matches the found one. - UR_ASSERT(!Platform || Platform == p.get(), - UR_RESULT_ERROR_INVALID_PLATFORM); - break; - } - } - } else { - return GlobalAdapter->PlatformCache->get_error(); - } - - if (Dev == nullptr) - return UR_RESULT_ERROR_INVALID_VALUE; - - *Device = Dev; - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetGlobalTimestamps( - ur_device_handle_t Device, ///< [in] handle of the device instance - uint64_t *DeviceTimestamp, ///< [out][optional] pointer to the Device's - ///< global timestamp that correlates with the - ///< Host's global timestamp value - uint64_t *HostTimestamp ///< [out][optional] pointer to the Host's global - ///< timestamp that correlates with the Device's - ///< global timestamp value -) { - const uint64_t &ZeTimerResolution = - Device->ZeDeviceProperties->timerResolution; - const uint64_t TimestampMaxCount = Device->getTimestampMask(); - uint64_t DeviceClockCount, Dummy; - - ZE2UR_CALL(zeDeviceGetGlobalTimestamps, - (Device->ZeDevice, - HostTimestamp == nullptr ? 
&Dummy : HostTimestamp, - &DeviceClockCount)); - - if (DeviceTimestamp != nullptr) { - *DeviceTimestamp = - (DeviceClockCount & TimestampMaxCount) * ZeTimerResolution; - } - - return UR_RESULT_SUCCESS; -} diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp index 4672a0a4f6..a8b8098819 100644 --- a/source/adapters/level_zero/device.hpp +++ b/source/adapters/level_zero/device.hpp @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include @@ -61,7 +61,7 @@ struct ur_device_handle_t_ : _ur_object { ur_device_handle_t_(ze_device_handle_t Device, ur_platform_handle_t Plt, ur_device_handle_t ParentDevice = nullptr) : ZeDevice{Device}, Platform{Plt}, RootDevice{ParentDevice}, - ZeDeviceProperties{}, ZeDeviceComputeProperties{} { + ZeDeviceProperties{}, ZeDeviceComputeProperties{}, Id(std::nullopt) { // NOTE: one must additionally call initialize() to complete // UR device creation. } @@ -189,6 +189,9 @@ struct ur_device_handle_t_ : _ur_object { (ZeDeviceProperties->deviceId & 0xff0) == 0xb60; } + // Checks if this GPU is an Intel Flex GPU or Intel Arc Alchemist + bool isDG2() { return (ZeDeviceProperties->deviceId & 0xff00) == 0x5600; } + bool isIntegrated() { return (ZeDeviceProperties->flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED); } @@ -221,6 +224,10 @@ struct ur_device_handle_t_ : _ur_object { ZeCache> ZeDeviceMutableCmdListsProperties; + // Map device bindless image offset to corresponding host image handle. 
+ std::unordered_map + ZeOffsetToImageHandleMap; + // unique ephemeral identifer of the device in the adapter - DeviceId Id; + std::optional Id; }; diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index b67cccc4f1..7c3a1da988 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -8,13 +8,30 @@ // //===----------------------------------------------------------------------===// +#include #include +#include -#include "queue.hpp" +namespace ur::level_zero { + +ur_result_t urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, + ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, + uint32_t numMemsInMemList, const ur_mem_handle_t *phMemList, + const ur_exp_enqueue_native_command_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + std::ignore = hQueue; + std::ignore = pfnNativeEnqueue; + std::ignore = data; + std::ignore = numMemsInMemList; + std::ignore = phMemList; + std::ignore = pProperties; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; -ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( - ur_exp_enqueue_native_command_function_t, void *, uint32_t, - const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, - uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 33495f52b8..2bd3011b4b 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -18,6 +18,7 @@ #include "common.hpp" #include "event.hpp" #include "logger/ur_logger.hpp" +#include "ur_interface_loader.hpp" #include "ur_level_zero.hpp" void printZeEventList(const _ur_ze_event_list_t &UrZeEventList) { 
@@ -46,21 +47,23 @@ static const bool UseMultipleCmdlistBarriers = [] { }(); bool WaitListEmptyOrAllEventsFromSameQueue( - ur_queue_handle_legacy_t Queue, uint32_t NumEventsInWaitList, + ur_queue_handle_t Queue, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList) { if (!NumEventsInWaitList) return true; for (uint32_t i = 0; i < NumEventsInWaitList; ++i) { - if (Queue != Legacy(EventWaitList[i]->UrQueue)) + if (Queue != EventWaitList[i]->UrQueue) return false; } return true; } -ur_result_t ur_queue_handle_legacy_t_::enqueueEventsWait( ///< [in] handle of - ///< the queue object +namespace ur::level_zero { + +ur_result_t urEnqueueEventsWait( + ur_queue_handle_t Queue, ///< [in] handle of the queue object uint32_t NumEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] @@ -72,7 +75,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueEventsWait( ///< [in] handle of *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. ) { - auto Queue = this; if (EventWaitList) { bool UseCopyEngine = false; @@ -152,9 +154,8 @@ static const bool InOrderBarrierBySignal = [] { return (UrRet ? std::atoi(UrRet) : true); }(); -ur_result_t -ur_queue_handle_legacy_t_::enqueueEventsWaitWithBarrier( ///< [in] handle of the - ///< queue object +ur_result_t urEnqueueEventsWaitWithBarrier( + ur_queue_handle_t Queue, ///< [in] handle of the queue object uint32_t NumEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] @@ -166,52 +167,66 @@ ur_queue_handle_legacy_t_::enqueueEventsWaitWithBarrier( ///< [in] handle of the *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. ) { - auto Queue = this; - // Lock automatically releases when this goes out of scope. 
std::scoped_lock lock(Queue->Mutex); // Helper function for appending a barrier to a command list. - auto insertBarrierIntoCmdList = - [&Queue](ur_command_list_ptr_t CmdList, - const _ur_ze_event_list_t &EventWaitList, - ur_event_handle_t &Event, bool IsInternal) { - UR_CALL(createEventAndAssociateQueue( - Queue, &Event, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, CmdList, - IsInternal, false)); - - Event->WaitList = EventWaitList; - - // For in-order queue we don't need a real barrier, just wait for - // requested events in potentially different queues and add a "barrier" - // event signal because it is already guaranteed that previous commands - // in this queue are completed when the signal is started. - // - // Only consideration here is that when profiling is used, signalEvent - // cannot be used if EventWaitList.Lenght == 0. In those cases, we need - // to fallback directly to barrier to have correct timestamps. See here: - // https://spec.oneapi.io/level-zero/latest/core/api.html?highlight=appendsignalevent#_CPPv430zeCommandListAppendSignalEvent24ze_command_list_handle_t17ze_event_handle_t - // - // TODO: this and other special handling of in-order queues to be - // updated when/if Level Zero adds native support for in-order queues. 
- // - if (Queue->isInOrderQueue() && InOrderBarrierBySignal && - !Queue->isProfilingEnabled()) { - if (EventWaitList.Length) { - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (CmdList->first, EventWaitList.Length, - EventWaitList.ZeEventList)); + auto insertBarrierIntoCmdList = [&Queue](ur_command_list_ptr_t CmdList, + _ur_ze_event_list_t &EventWaitList, + ur_event_handle_t &Event, + bool IsInternal) { + UR_CALL(createEventAndAssociateQueue(Queue, &Event, + UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, + CmdList, IsInternal, false)); + + Event->WaitList = EventWaitList; + + // For in-order queue we don't need a real barrier, just wait for + // requested events in potentially different queues and add a "barrier" + // event signal because it is already guaranteed that previous commands + // in this queue are completed when the signal is started. + // + // Only consideration here is that when profiling is used, signalEvent + // cannot be used if EventWaitList.Lenght == 0. In those cases, we need + // to fallback directly to barrier to have correct timestamps. See here: + // https://spec.oneapi.io/level-zero/latest/core/api.html?highlight=appendsignalevent#_CPPv430zeCommandListAppendSignalEvent24ze_command_list_handle_t17ze_event_handle_t + // + // TODO: this and other special handling of in-order queues to be + // updated when/if Level Zero adds native support for in-order queues. + // + if (Queue->isInOrderQueue() && InOrderBarrierBySignal && + !Queue->isProfilingEnabled()) { + if (EventWaitList.Length) { + if (CmdList->second.IsInOrderList) { + for (unsigned i = EventWaitList.Length; i-- > 0;) { + // If the event is a multidevice event, then given driver in order + // lists, we cannot include this into the wait event list due to + // driver limitations. 
+ if (EventWaitList.UrEventList[i]->IsMultiDevice) { + EventWaitList.Length--; + if (EventWaitList.Length != i) { + std::swap(EventWaitList.UrEventList[i], + EventWaitList.UrEventList[EventWaitList.Length]); + std::swap(EventWaitList.ZeEventList[i], + EventWaitList.ZeEventList[EventWaitList.Length]); + } + } } - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (CmdList->first, Event->ZeEvent)); - } else { - ZE2UR_CALL(zeCommandListAppendBarrier, - (CmdList->first, Event->ZeEvent, EventWaitList.Length, - EventWaitList.ZeEventList)); } + ZE2UR_CALL( + zeCommandListAppendWaitOnEvents, + (CmdList->first, EventWaitList.Length, EventWaitList.ZeEventList)); + } + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CmdList->first, Event->ZeEvent)); + } else { + ZE2UR_CALL(zeCommandListAppendBarrier, + (CmdList->first, Event->ZeEvent, EventWaitList.Length, + EventWaitList.ZeEventList)); + } - return UR_RESULT_SUCCESS; - }; + return UR_RESULT_SUCCESS; + }; // If the queue is in-order then each command in it effectively acts as a // barrier, so we don't need to do anything except if we were requested @@ -222,18 +237,22 @@ ur_queue_handle_legacy_t_::enqueueEventsWaitWithBarrier( ///< [in] handle of the return UR_RESULT_SUCCESS; } - ur_event_handle_t InternalEvent; + ur_event_handle_t ResultEvent = nullptr; bool IsInternal = OutEvent == nullptr; - ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; // For in-order queue and wait-list which is empty or has events from // the same queue just use the last command event as the barrier event. - if (Queue->isInOrderQueue() && + // This optimization is disabled when profiling is enabled to ensure + // accurate profiling values & the overhead that profiling incurs. 
+ if (Queue->isInOrderQueue() && !Queue->isProfilingEnabled() && WaitListEmptyOrAllEventsFromSameQueue(Queue, NumEventsInWaitList, EventWaitList) && Queue->LastCommandEvent && !Queue->LastCommandEvent->IsDiscarded) { - UR_CALL(urEventRetain(Queue->LastCommandEvent)); - *Event = Queue->LastCommandEvent; + UR_CALL(ur::level_zero::urEventRetain(Queue->LastCommandEvent)); + ResultEvent = Queue->LastCommandEvent; + if (OutEvent) { + *OutEvent = ResultEvent; + } return UR_RESULT_SUCCESS; } @@ -263,16 +282,21 @@ ur_queue_handle_legacy_t_::enqueueEventsWaitWithBarrier( ///< [in] handle of the EventWaitList, OkToBatch)); // Insert the barrier into the command-list and execute. - UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal)); + UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, ResultEvent, + IsInternal)); UR_CALL(Queue->executeCommandList(CmdList, false, OkToBatch)); // Because of the dependency between commands in the in-order queue we don't // need to keep track of any active barriers if we have in-order queue. if (UseMultipleCmdlistBarriers && !Queue->isInOrderQueue()) { - auto UREvent = reinterpret_cast(*Event); + auto UREvent = reinterpret_cast(ResultEvent); Queue->ActiveBarriers.add(UREvent); } + + if (OutEvent) { + *OutEvent = ResultEvent; + } return UR_RESULT_SUCCESS; } @@ -302,8 +326,8 @@ ur_queue_handle_legacy_t_::enqueueEventsWaitWithBarrier( ///< [in] handle of the for (auto &QueueMap : {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) for (auto &QueueGroup : QueueMap) { - bool UseCopyEngine = QueueGroup.second.Type != - ur_queue_handle_legacy_t_::queue_type::Compute; + bool UseCopyEngine = + QueueGroup.second.Type != ur_queue_handle_t_::queue_type::Compute; if (Queue->UsingImmCmdLists) { // If immediate command lists are being used, each will act as their own // queue, so we must insert a barrier into each. 
@@ -339,9 +363,9 @@ ur_queue_handle_legacy_t_::enqueueEventsWaitWithBarrier( ///< [in] handle of the // command-lists. std::vector EventWaitVector(CmdLists.size()); for (size_t I = 0; I < CmdLists.size(); ++I) { - UR_CALL(insertBarrierIntoCmdList(CmdLists[I], _ur_ze_event_list_t{}, - EventWaitVector[I], - true /*IsInternal*/)); + _ur_ze_event_list_t waitlist; + UR_CALL(insertBarrierIntoCmdList( + CmdLists[I], waitlist, EventWaitVector[I], true /*IsInternal*/)); } // If there were multiple queues we need to create a "convergence" event to // be our active barrier. This convergence event is signalled by a barrier @@ -360,20 +384,21 @@ ur_queue_handle_legacy_t_::enqueueEventsWaitWithBarrier( ///< [in] handle of the // Insert a barrier with the events from each command-queue into the // convergence command list. The resulting event signals the convergence of // all barriers. - UR_CALL(insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList, *Event, - IsInternal)); + UR_CALL(insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList, + ResultEvent, IsInternal)); } else { // If there is only a single queue then insert a barrier and the single // result event can be used as our active barrier and used as the return // event. Take into account whether output event is discarded or not. - UR_CALL(insertBarrierIntoCmdList(CmdLists[0], _ur_ze_event_list_t{}, *Event, + _ur_ze_event_list_t waitlist; + UR_CALL(insertBarrierIntoCmdList(CmdLists[0], waitlist, ResultEvent, IsInternal)); } // Execute each command list so the barriers can be encountered. for (ur_command_list_ptr_t &CmdList : CmdLists) { - bool IsCopy = CmdList->second.isCopy( - reinterpret_cast(Queue)); + bool IsCopy = + CmdList->second.isCopy(reinterpret_cast(Queue)); const auto &CommandBatch = (IsCopy) ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; // Only batch if the matching CmdList is already open. 
@@ -383,12 +408,14 @@ ur_queue_handle_legacy_t_::enqueueEventsWaitWithBarrier( ///< [in] handle of the } UR_CALL(Queue->ActiveBarriers.clear()); - auto UREvent = reinterpret_cast(*Event); - Queue->ActiveBarriers.add(UREvent); + Queue->ActiveBarriers.add(ResultEvent); + if (OutEvent) { + *OutEvent = ResultEvent; + } return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo( +ur_result_t urEventGetInfo( ur_event_handle_t Event, ///< [in] handle of the event object ur_event_info_t PropName, ///< [in] the name of the event property to query size_t PropValueSize, ///< [in] size in bytes of the event property value @@ -417,7 +444,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo( // possible that this is trying to query some event's status that // is part of the batch. This isn't strictly required, but it seems // like a reasonable thing to do. - auto UrQueue = Legacy(Event->UrQueue); + auto UrQueue = Event->UrQueue; if (UrQueue) { // Lock automatically releases when this goes out of scope. std::unique_lock Lock(UrQueue->Mutex, std::try_to_lock); @@ -471,7 +498,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( +ur_result_t urEventGetProfilingInfo( ur_event_handle_t Event, ///< [in] handle of the event object ur_profiling_info_t PropName, ///< [in] the name of the profiling property to query @@ -489,9 +516,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } - ur_device_handle_t Device = Legacy(Event->UrQueue) - ? Legacy(Event->UrQueue)->Device - : Event->Context->Devices[0]; + ur_device_handle_t Device = + Event->UrQueue ? 
Event->UrQueue->Device : Event->Context->Devices[0]; uint64_t ZeTimerResolution = Device->ZeDeviceProperties->timerResolution; const uint64_t TimestampMaxValue = Device->getTimestampMask(); @@ -515,10 +541,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return ReturnValue(Event->RecordEventEndTimestamp); // Otherwise we need to collect it from the queue. - auto Entry = Legacy(Event->UrQueue)->EndTimeRecordings.find(Event); + auto Entry = Event->UrQueue->EndTimeRecordings.find(Event); // Unexpected state if there is no end-time record. - if (Entry == Legacy(Event->UrQueue)->EndTimeRecordings.end()) + if (Entry == Event->UrQueue->EndTimeRecordings.end()) return UR_RESULT_ERROR_UNKNOWN; auto &EndTimeRecording = Entry->second; @@ -543,7 +569,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( // anymore, so we cache it on the event and evict the record from the // queue. Event->RecordEventEndTimestamp = ContextEndTime; - Legacy(Event->UrQueue)->EndTimeRecordings.erase(Entry); + Event->UrQueue->EndTimeRecordings.erase(Entry); return ReturnValue(ContextEndTime); } @@ -661,7 +687,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::enqueueTimestampRecordingExp( +ur_result_t urEnqueueTimestampRecordingExp( + ur_queue_handle_t Queue, ///< [in] handle of the queue object bool Blocking, ///< [in] blocking or non-blocking enqueue uint32_t NumEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t @@ -675,7 +702,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueTimestampRecordingExp( *OutEvent ///< [in,out] return an event object that identifies ///< this particular command instance. ) { - auto Queue = this; // Lock automatically releases when this goes out of scope. 
std::scoped_lock lock(Queue->Mutex); @@ -699,12 +725,13 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueTimestampRecordingExp( (*OutEvent)->WaitList = TmpWaitList; uint64_t DeviceStartTimestamp = 0; - UR_CALL(urDeviceGetGlobalTimestamps(Device, &DeviceStartTimestamp, nullptr)); + UR_CALL(ur::level_zero::urDeviceGetGlobalTimestamps( + Device, &DeviceStartTimestamp, nullptr)); (*OutEvent)->RecordEventStartTimestamp = DeviceStartTimestamp; // Create a new entry in the queue's recordings. Queue->EndTimeRecordings[*OutEvent] = - ur_queue_handle_legacy_t_::end_time_recording{}; + ur_queue_handle_t_::end_time_recording{}; ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp, (CommandList->first, @@ -718,64 +745,15 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueTimestampRecordingExp( return UR_RESULT_SUCCESS; } -ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( - ze_event_handle_t &ZeHostVisibleEvent) { - auto UrQueue = Legacy(this->UrQueue); - - std::scoped_lock Lock(UrQueue->Mutex, - this->Mutex); - - if (!HostVisibleEvent) { - this->IsCreatingHostProxyEvent = true; - if (UrQueue->ZeEventsScope != OnDemandHostVisibleProxy) - die("getOrCreateHostVisibleEvent: missing host-visible event"); - - // Submit the command(s) signalling the proxy event to the queue. - // We have to first submit a wait for the device-only event for which this - // proxy is created. - // - // Get a new command list to be used on this call - - // We want to batch these commands to avoid extra submissions (costly) - bool OkToBatch = true; - - ur_command_list_ptr_t CommandList{}; - UR_CALL(UrQueue->Context->getAvailableCommandList( - UrQueue, CommandList, false /* UseCopyEngine */, 0, nullptr, OkToBatch)) - - // Create a "proxy" host-visible event. 
- UR_CALL(createEventAndAssociateQueue( - UrQueue, &HostVisibleEvent, UR_EXT_COMMAND_TYPE_USER, CommandList, - /* IsInternal */ false, /* IsMultiDevice */ false, - /* HostVisible */ true)); - - if (this->IsInnerBatchedEvent) { - ZE2UR_CALL(zeCommandListAppendBarrier, - (CommandList->first, ZeEvent, 0, nullptr)); - } else { - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (CommandList->first, 1, &ZeEvent)); - } - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (CommandList->first, HostVisibleEvent->ZeEvent)); - - UR_CALL(UrQueue->executeCommandList(CommandList, false, OkToBatch)) - this->IsCreatingHostProxyEvent = false; - } - - ZeHostVisibleEvent = HostVisibleEvent->ZeEvent; - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEventWait( - uint32_t NumEvents, ///< [in] number of events in the event list - const ur_event_handle_t - *EventWaitList ///< [in][range(0, numEvents)] pointer to a list of - ///< events to wait for completion +ur_result_t +urEventWait(uint32_t NumEvents, ///< [in] number of events in the event list + const ur_event_handle_t + *EventWaitList ///< [in][range(0, numEvents)] pointer to a list + ///< of events to wait for completion ) { for (uint32_t I = 0; I < NumEvents; I++) { auto e = EventWaitList[I]; - auto UrQueue = Legacy(e->UrQueue); + auto UrQueue = e->UrQueue; if (UrQueue && UrQueue->ZeEventsScope == OnDemandHostVisibleProxy) { // Make sure to add all host-visible "proxy" event signals if needed. // This ensures that all signalling commands are submitted below and @@ -793,7 +771,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait( // Submit dependent open command lists for execution, if any for (uint32_t I = 0; I < NumEvents; I++) { ur_event_handle_t_ *Event = ur_cast(EventWaitList[I]); - auto UrQueue = Legacy(Event->UrQueue); + auto UrQueue = Event->UrQueue; if (UrQueue) { // Lock automatically releases when this goes out of scope. 
std::scoped_lock lock(UrQueue->Mutex); @@ -801,7 +779,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait( UR_CALL(UrQueue->executeAllOpenCommandLists()); } } - std::unordered_set Queues; + std::unordered_set Queues; for (uint32_t I = 0; I < NumEvents; I++) { { ur_event_handle_t_ *Event = @@ -828,13 +806,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait( Event->Completed = true; } } - if (auto Q = Legacy(Event->UrQueue)) { + if (auto Q = Event->UrQueue) { if (Q->UsingImmCmdLists && Q->isInOrderQueue()) // Use information about waited event to cleanup completed events in // the in-order queue. CleanupEventsInImmCmdLists( - Legacy(Event->UrQueue), false /* QueueLocked */, - false /* QueueSynced */, + Event->UrQueue, false /* QueueLocked */, false /* QueueSynced */, reinterpret_cast(Event)); else { // NOTE: we are cleaning up after the event here to free resources @@ -859,8 +836,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urEventRetain( - ur_event_handle_t Event ///< [in] handle of the event object +ur_result_t +urEventRetain(ur_event_handle_t Event ///< [in] handle of the event object ) { Event->RefCountExternal++; Event->RefCount.increment(); @@ -868,8 +845,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urEventRelease( - ur_event_handle_t Event ///< [in] handle of the event object +ur_result_t +urEventRelease(ur_event_handle_t Event ///< [in] handle of the event object ) { Event->RefCountExternal--; UR_CALL(urEventReleaseInternal(Event)); @@ -877,7 +854,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( +ur_result_t urEventGetNativeHandle( ur_event_handle_t Event, ///< [in] handle of the event. ur_native_handle_t *NativeEvent ///< [out] a pointer to the native handle of the event. 
@@ -890,7 +867,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( // Event can potentially be in an open command-list, make sure that // it is submitted for execution to avoid potential deadlock if // interop app is going to wait for it. - auto Queue = Legacy(Event->UrQueue); + auto Queue = Event->UrQueue; if (Queue) { std::scoped_lock lock(Queue->Mutex); const auto &OpenCommandList = Queue->eventOpenCommandList(Event); @@ -902,7 +879,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urExtEventCreate( +ur_result_t urExtEventCreate( ur_context_handle_t Context, ///< [in] handle of the context object ur_event_handle_t *Event ///< [out] pointer to the handle of the event object created. @@ -915,7 +892,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urExtEventCreate( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( +ur_result_t urEventCreateWithNativeHandle( ur_native_handle_t NativeEvent, ///< [in] the native handle of the event. 
ur_context_handle_t Context, ///< [in] handle of the context object const ur_event_native_properties_t *Properties, @@ -965,7 +942,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback( +ur_result_t urEventSetCallback( ur_event_handle_t Event, ///< [in] handle of the event object ur_execution_info_t ExecStatus, ///< [in] execution status of the event ur_event_callback_t Notify, ///< [in] execution status of the event @@ -981,6 +958,57 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +} // namespace ur::level_zero + +ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( + ze_event_handle_t &ZeHostVisibleEvent) { + auto UrQueue = this->UrQueue; + + std::scoped_lock Lock(UrQueue->Mutex, + this->Mutex); + + if (!HostVisibleEvent) { + this->IsCreatingHostProxyEvent = true; + if (UrQueue->ZeEventsScope != OnDemandHostVisibleProxy) + die("getOrCreateHostVisibleEvent: missing host-visible event"); + + // Submit the command(s) signalling the proxy event to the queue. + // We have to first submit a wait for the device-only event for which this + // proxy is created. + // + // Get a new command list to be used on this call + + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + + ur_command_list_ptr_t CommandList{}; + UR_CALL(UrQueue->Context->getAvailableCommandList( + UrQueue, CommandList, false /* UseCopyEngine */, 0, nullptr, OkToBatch)) + + // Create a "proxy" host-visible event. 
+ UR_CALL(createEventAndAssociateQueue( + UrQueue, &HostVisibleEvent, UR_EXT_COMMAND_TYPE_USER, CommandList, + /* IsInternal */ false, /* IsMultiDevice */ false, + /* HostVisible */ true)); + + if (this->IsInnerBatchedEvent) { + ZE2UR_CALL(zeCommandListAppendBarrier, + (CommandList->first, ZeEvent, 0, nullptr)); + } else { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (CommandList->first, 1, &ZeEvent)); + } + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandList->first, HostVisibleEvent->ZeEvent)); + + UR_CALL(UrQueue->executeCommandList(CommandList, false, OkToBatch)) + this->IsCreatingHostProxyEvent = false; + } + + ZeHostVisibleEvent = HostVisibleEvent->ZeEvent; + return UR_RESULT_SUCCESS; +} + ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { if (!Event->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; @@ -1020,7 +1048,7 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { } // Save pointer to the queue before deleting/resetting event. - auto Queue = Legacy(Event->UrQueue); + auto Queue = Event->UrQueue; // If the event was a timestamp recording, we try to evict its entry in the // queue. @@ -1034,7 +1062,7 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { EndTimeRecording.EventHasDied = true; } else { // Otherwise we evict the entry. - Legacy(Event->UrQueue)->EndTimeRecordings.erase(Entry); + Queue->EndTimeRecordings.erase(Entry); } } } @@ -1097,7 +1125,7 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked, ur_kernel_handle_t AssociatedKernel = nullptr; // List of dependent events. std::list EventsToBeReleased; - ur_queue_handle_legacy_t AssociatedQueue = nullptr; + ur_queue_handle_t AssociatedQueue = nullptr; { // If the Event is already locked, then continue with the cleanup, otherwise // block on locking the event. 
@@ -1111,7 +1139,7 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked, if (Event->CleanedUp) return UR_RESULT_SUCCESS; - AssociatedQueue = Legacy(Event->UrQueue); + AssociatedQueue = Event->UrQueue; // Remember the kernel associated with this event if there is one. We are // going to release it later. @@ -1156,7 +1184,7 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked, // We've reset event data members above, now cleanup resources. if (AssociatedKernel) { ReleaseIndirectMem(AssociatedKernel); - UR_CALL(urKernelRelease(AssociatedKernel)); + UR_CALL(ur::level_zero::urKernelRelease(AssociatedKernel)); } if (AssociatedQueue) { @@ -1215,7 +1243,7 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked, } if (DepEventKernel) { ReleaseIndirectMem(DepEventKernel); - UR_CALL(urKernelRelease(DepEventKernel)); + UR_CALL(ur::level_zero::urKernelRelease(DepEventKernel)); } UR_CALL(urEventReleaseInternal(DepEvent)); } @@ -1228,9 +1256,9 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked, // The "HostVisible" argument specifies if event needs to be allocated from // a host-visible pool. 
// -ur_result_t EventCreate(ur_context_handle_t Context, - ur_queue_handle_legacy_t Queue, bool IsMultiDevice, - bool HostVisible, ur_event_handle_t *RetEvent, +ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, + bool IsMultiDevice, bool HostVisible, + ur_event_handle_t *RetEvent, bool CounterBasedEventEnabled, bool ForceDisableProfiling) { bool ProfilingEnabled = @@ -1317,7 +1345,7 @@ ur_result_t ur_event_handle_t_::reset() { ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( uint32_t EventListLength, const ur_event_handle_t *EventList, - ur_queue_handle_legacy_t CurQueue, bool UseCopyEngine) { + ur_queue_handle_t CurQueue, bool UseCopyEngine) { this->Length = 0; this->ZeEventList = nullptr; this->UrEventList = nullptr; @@ -1433,7 +1461,7 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( } } - auto Queue = Legacy(EventList[I]->UrQueue); + auto Queue = EventList[I]->UrQueue; auto CurQueueDevice = CurQueue->Device; std::optional> QueueLock = @@ -1506,8 +1534,8 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( std::shared_lock Lock(EventList[I]->Mutex); - ur_device_handle_t QueueRootDevice; - ur_device_handle_t CurrentQueueRootDevice; + ur_device_handle_t QueueRootDevice = nullptr; + ur_device_handle_t CurrentQueueRootDevice = nullptr; if (Queue) { QueueRootDevice = Queue->Device; CurrentQueueRootDevice = CurQueueDevice; @@ -1533,11 +1561,11 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( const auto &ZeCommandList = CommandList->first; EventList[I]->RefCount.increment(); - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, 1u, &EventList[I]->ZeEvent)); - if (!MultiDeviceEvent->CounterBasedEventsEnabled) - ZE2UR_CALL(zeEventHostSignal, (MultiDeviceZeEvent)); - + // Append a Barrier to wait on the original event while signalling the + // new multi device event. 
+ ZE2UR_CALL( + zeCommandListAppendBarrier, + (ZeCommandList, MultiDeviceZeEvent, 1u, &EventList[I]->ZeEvent)); UR_CALL(Queue->executeCommandList(CommandList, /* IsBlocking */ false, /* OkToBatchCommand */ true)); @@ -1634,7 +1662,7 @@ ur_result_t _ur_ze_event_list_t::collectEventsForReleaseAndDestroyUrZeEventList( // Tells if this event is with profiling capabilities. bool ur_event_handle_t_::isProfilingEnabled() const { return !UrQueue || // tentatively assume user events are profiling enabled - (Legacy(UrQueue)->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; + (UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; } // Tells if this event was created as a timestamp event, allowing profiling diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index e99df2a272..7dd64acdaa 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include @@ -29,9 +29,9 @@ extern "C" { ur_result_t urEventReleaseInternal(ur_event_handle_t Event); -ur_result_t EventCreate(ur_context_handle_t Context, - ur_queue_handle_legacy_t Queue, bool IsMultiDevice, - bool HostVisible, ur_event_handle_t *RetEvent, +ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, + bool IsMultiDevice, bool HostVisible, + ur_event_handle_t *RetEvent, bool CounterBasedEventEnabled = false, bool ForceDisableProfiling = false); } // extern "C" @@ -89,7 +89,7 @@ struct _ur_ze_event_list_t { // command-lists. 
ur_result_t createAndRetainUrZeEventList(uint32_t EventListLength, const ur_event_handle_t *EventList, - ur_queue_handle_legacy_t CurQueue, + ur_queue_handle_t CurQueue, bool UseCopyEngine); // Add all the events in this object's UrEventList to the end diff --git a/source/adapters/level_zero/helpers/kernel_helpers.cpp b/source/adapters/level_zero/helpers/kernel_helpers.cpp new file mode 100644 index 0000000000..d043d59ce1 --- /dev/null +++ b/source/adapters/level_zero/helpers/kernel_helpers.cpp @@ -0,0 +1,157 @@ +//===--------- kernel_helpers.cpp - Level Zero Adapter -------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kernel_helpers.hpp" +#include "logger/ur_logger.hpp" + +#include "../common.hpp" +#include "../device.hpp" + +#ifdef UR_ADAPTER_LEVEL_ZERO_V2 +#include "../v2/context.hpp" +#else +#include "../context.hpp" +#endif + +ur_result_t getSuggestedLocalWorkSize(ur_device_handle_t hDevice, + ze_kernel_handle_t hZeKernel, + size_t GlobalWorkSize3D[3], + uint32_t SuggestedLocalWorkSize3D[3]) { + uint32_t *WG = SuggestedLocalWorkSize3D; + + // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize + // values do not fit to 32-bit that the API only supports currently. + bool SuggestGroupSize = true; + for (int I : {0, 1, 2}) { + if (GlobalWorkSize3D[I] > UINT32_MAX) { + SuggestGroupSize = false; + } + } + if (SuggestGroupSize) { + ZE2UR_CALL(zeKernelSuggestGroupSize, + (hZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1], + GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2])); + } else { + for (int I : {0, 1, 2}) { + // Try to find a I-dimension WG size that the GlobalWorkSize[I] is + // fully divisable with. Start with the max possible size in + // each dimension. 
+ uint32_t GroupSize[] = { + hDevice->ZeDeviceComputeProperties->maxGroupSizeX, + hDevice->ZeDeviceComputeProperties->maxGroupSizeY, + hDevice->ZeDeviceComputeProperties->maxGroupSizeZ}; + GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]); + while (GlobalWorkSize3D[I] % GroupSize[I]) { + --GroupSize[I]; + } + if (GlobalWorkSize3D[I] / GroupSize[I] > UINT32_MAX) { + logger::error("getSuggestedLocalWorkSize: can't find a WG size " + "suitable for global work size > UINT32_MAX"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + WG[I] = GroupSize[I]; + } + logger::debug( + "getSuggestedLocalWorkSize: using computed WG size = {{{}, {}, {}}}", + WG[0], WG[1], WG[2]); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t setKernelGlobalOffset(ur_context_handle_t Context, + ze_kernel_handle_t Kernel, + const size_t *GlobalWorkOffset) { + if (!Context->getPlatform()->ZeDriverGlobalOffsetExtensionFound) { + logger::debug("No global offset extension found on this driver"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + ZE2UR_CALL( + zeKernelSetGlobalOffsetExp, + (Kernel, GlobalWorkOffset[0], GlobalWorkOffset[1], GlobalWorkOffset[2])); + + return UR_RESULT_SUCCESS; +} + +ur_result_t calculateKernelWorkDimensions( + ze_kernel_handle_t Kernel, ur_device_handle_t Device, + ze_group_count_t &ZeThreadGroupDimensions, uint32_t (&WG)[3], + uint32_t WorkDim, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize) { + + UR_ASSERT(GlobalWorkSize, UR_RESULT_ERROR_INVALID_VALUE); + // If LocalWorkSize is not provided then Kernel must be provided to query + // suggested group size. + UR_ASSERT(LocalWorkSize || Kernel, UR_RESULT_ERROR_INVALID_VALUE); + + // New variable needed because GlobalWorkSize parameter might not be of size + // 3 + size_t GlobalWorkSize3D[3]{1, 1, 1}; + std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D); + + if (LocalWorkSize) { + WG[0] = ur_cast(LocalWorkSize[0]); + WG[1] = WorkDim >= 2 ? 
ur_cast(LocalWorkSize[1]) : 1; + WG[2] = WorkDim == 3 ? ur_cast(LocalWorkSize[2]) : 1; + } else { + UR_CALL(getSuggestedLocalWorkSize(Device, Kernel, GlobalWorkSize3D, WG)); + } + + // TODO: assert if sizes do not fit into 32-bit? + switch (WorkDim) { + case 3: + ZeThreadGroupDimensions.groupCountX = + ur_cast(GlobalWorkSize3D[0] / WG[0]); + ZeThreadGroupDimensions.groupCountY = + ur_cast(GlobalWorkSize3D[1] / WG[1]); + ZeThreadGroupDimensions.groupCountZ = + ur_cast(GlobalWorkSize3D[2] / WG[2]); + break; + case 2: + ZeThreadGroupDimensions.groupCountX = + ur_cast(GlobalWorkSize3D[0] / WG[0]); + ZeThreadGroupDimensions.groupCountY = + ur_cast(GlobalWorkSize3D[1] / WG[1]); + WG[2] = 1; + break; + case 1: + ZeThreadGroupDimensions.groupCountX = + ur_cast(GlobalWorkSize3D[0] / WG[0]); + WG[1] = WG[2] = 1; + break; + + default: + logger::error("calculateKernelWorkDimensions: unsupported work_dim"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + // Error handling for non-uniform group size case + if (GlobalWorkSize3D[0] != + size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) { + logger::error("calculateKernelWorkDimensions: invalid work_dim. The range " + "is not a multiple of the group size in the 1st dimension"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (GlobalWorkSize3D[1] != + size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) { + logger::error("calculateKernelWorkDimensions: invalid work_dim. The range " + "is not a multiple of the group size in the 2nd dimension"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (GlobalWorkSize3D[2] != + size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) { + logger::error("calculateKernelWorkDimensions: invalid work_dim. 
The range " + "is not a multiple of the group size in the 3rd dimension"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + + return UR_RESULT_SUCCESS; +} diff --git a/source/adapters/level_zero/helpers/kernel_helpers.hpp b/source/adapters/level_zero/helpers/kernel_helpers.hpp new file mode 100644 index 0000000000..2eaced02d3 --- /dev/null +++ b/source/adapters/level_zero/helpers/kernel_helpers.hpp @@ -0,0 +1,57 @@ +//===--------- kernel_helpers.hpp - Level Zero Adapter -------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +/** + * Calculates a work group size for the kernel based on the GlobalWorkSize or + * the LocalWorkSize if provided. + * @param[in][optional] Kernel The Kernel. Used when LocalWorkSize is not + * provided. + * @param[in][optional] Device The device associated with the kernel. Used when + * LocalWorkSize is not provided. + * @param[out] ZeThreadGroupDimensions Number of work groups in each dimension. + * @param[out] WG The work group size for each dimension. + * @param[in] WorkDim The number of dimensions in the kernel. + * @param[in] GlobalWorkSize The global work size. + * @param[in][optional] LocalWorkSize The local work size. + * @return UR_RESULT_SUCCESS or an error code on failure. + */ +ur_result_t calculateKernelWorkDimensions( + ze_kernel_handle_t Kernel, ur_device_handle_t Device, + ze_group_count_t &ZeThreadGroupDimensions, uint32_t (&WG)[3], + uint32_t WorkDim, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize); + +/** + * Sets the global offset for a kernel command that will be appended to the + * command buffer. + * @param[in] Context Context associated with the queue. 
+ * @param[in] Kernel The handle to the kernel that will be appended. + * @param[in] GlobalWorkOffset The global offset value. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t setKernelGlobalOffset(ur_context_handle_t Context, + ze_kernel_handle_t Kernel, + const size_t *GlobalWorkOffset); + +/** + * Get the suggested local work size for a kernel. + * @param[in] hDevice The device associated with the kernel. + * @param[in] hZeKernel The kernel handle. + * @param[in] GlobalWorkSize3D The global work size. + * @param[out] SuggestedLocalWorkSize3D The suggested local work size. + * @return UR_RESULT_SUCCESS or an error code on failure. + */ +ur_result_t getSuggestedLocalWorkSize(ur_device_handle_t hDevice, + ze_kernel_handle_t hZeKernel, + size_t GlobalWorkSize3D[3], + uint32_t SuggestedLocalWorkSize3D[3]); diff --git a/source/adapters/level_zero/helpers/memory_helpers.cpp b/source/adapters/level_zero/helpers/memory_helpers.cpp new file mode 100644 index 0000000000..fb3150c3c8 --- /dev/null +++ b/source/adapters/level_zero/helpers/memory_helpers.cpp @@ -0,0 +1,73 @@ +//===--------- memory_helpers.cpp - Level Zero Adapter -------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "memory_helpers.hpp" +#include "../common.hpp" + +ze_memory_type_t getMemoryType(ze_context_handle_t hContext, void *ptr) { + // TODO: use UMF once + // https://github.com/oneapi-src/unified-memory-framework/issues/687 is + // implemented + ZeStruct zeMemoryAllocationProperties; + ZE2UR_CALL_THROWS(zeMemGetAllocProperties, + (hContext, ptr, &zeMemoryAllocationProperties, nullptr)); + return zeMemoryAllocationProperties.type; +} + +bool maybeImportUSM(ze_driver_handle_t hTranslatedDriver, + ze_context_handle_t hContext, void *ptr, size_t size) { + if (ZeUSMImport.Enabled && ptr != nullptr && + getMemoryType(hContext, ptr) == ZE_MEMORY_TYPE_UNKNOWN) { + // Promote the host ptr to USM host memory + ZeUSMImport.doZeUSMImport(hTranslatedDriver, ptr, size); + return true; + } + return false; +} + +ze_region_params ur2zeRegionParams(ur_rect_offset_t SrcOrigin, + ur_rect_offset_t DstOrigin, + ur_rect_region_t Region, size_t SrcRowPitch, + size_t DstRowPitch, size_t SrcSlicePitch, + size_t DstSlicePitch) { + uint32_t SrcOriginX = ur_cast(SrcOrigin.x); + uint32_t SrcOriginY = ur_cast(SrcOrigin.y); + uint32_t SrcOriginZ = ur_cast(SrcOrigin.z); + + uint32_t SrcPitch = SrcRowPitch; + if (SrcPitch == 0) + SrcPitch = ur_cast(Region.width); + + if (SrcSlicePitch == 0) + SrcSlicePitch = ur_cast(Region.height) * SrcPitch; + + uint32_t DstOriginX = ur_cast(DstOrigin.x); + uint32_t DstOriginY = ur_cast(DstOrigin.y); + uint32_t DstOriginZ = ur_cast(DstOrigin.z); + + uint32_t DstPitch = DstRowPitch; + if (DstPitch == 0) + DstPitch = ur_cast(Region.width); + + if (DstSlicePitch == 0) + DstSlicePitch = ur_cast(Region.height) * DstPitch; + + uint32_t Width = ur_cast(Region.width); + uint32_t Height = ur_cast(Region.height); + uint32_t Depth = ur_cast(Region.depth); + + const ze_copy_region_t ZeSrcRegion = {SrcOriginX, 
SrcOriginY, SrcOriginZ, + Width, Height, Depth}; + const ze_copy_region_t ZeDstRegion = {DstOriginX, DstOriginY, DstOriginZ, + Width, Height, Depth}; + + return ze_region_params{ZeDstRegion, DstPitch, DstSlicePitch, + ZeSrcRegion, SrcPitch, SrcSlicePitch}; +} diff --git a/source/adapters/level_zero/helpers/memory_helpers.hpp b/source/adapters/level_zero/helpers/memory_helpers.hpp new file mode 100644 index 0000000000..760580c69e --- /dev/null +++ b/source/adapters/level_zero/helpers/memory_helpers.hpp @@ -0,0 +1,39 @@ +//===--------- memory_helpers.hpp - Level Zero Adapter -------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include +#include + +// If USM Import feature is enabled and hostptr is supplied, +// import the hostptr if not already imported into USM. +// Data transfer rate is maximized when both source and destination +// are USM pointers. Promotion of the host pointer to USM thus +// optimizes data transfer performance. 
+bool maybeImportUSM(ze_driver_handle_t hTranslatedDriver, + ze_context_handle_t hContext, void *ptr, size_t size); + +ze_memory_type_t getMemoryType(ze_context_handle_t hContext, void *ptr); + +struct ze_region_params { + const ze_copy_region_t dstRegion; + size_t dstPitch; + size_t dstSlicePitch; + const ze_copy_region_t srcRegion; + size_t srcPitch; + size_t srcSlicePitch; +}; + +// Convert UR region parameters for zeCommandListAppendMemoryCopyRegion +ze_region_params ur2zeRegionParams(ur_rect_offset_t SrcOrigin, + ur_rect_offset_t DstOrigin, + ur_rect_region_t Region, size_t SrcRowPitch, + size_t DstRowPitch, size_t SrcSlicePitch, + size_t DstSlicePitch); diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp index d408c748f6..4810b3d88e 100644 --- a/source/adapters/level_zero/image.cpp +++ b/source/adapters/level_zero/image.cpp @@ -14,7 +14,9 @@ #include "event.hpp" #include "logger/ur_logger.hpp" #include "sampler.hpp" +#include "ur_interface_loader.hpp" #include "ur_level_zero.hpp" +#include "ze_api.h" typedef ze_result_t(ZE_APICALL *zeImageGetDeviceOffsetExp_pfn)( ze_image_handle_t hImage, uint64_t *pDeviceOffset); @@ -37,16 +39,19 @@ ur_result_t ze2urImageFormat(const ze_image_desc_t *ZeImageDesc, switch (ZeImageFormat.layout) { case ZE_IMAGE_FORMAT_LAYOUT_8: case ZE_IMAGE_FORMAT_LAYOUT_8_8: + case ZE_IMAGE_FORMAT_LAYOUT_8_8_8: case ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8: ZeImageFormatTypeSize = 8; break; case ZE_IMAGE_FORMAT_LAYOUT_16: case ZE_IMAGE_FORMAT_LAYOUT_16_16: + case ZE_IMAGE_FORMAT_LAYOUT_16_16_16: case ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16: ZeImageFormatTypeSize = 16; break; case ZE_IMAGE_FORMAT_LAYOUT_32: case ZE_IMAGE_FORMAT_LAYOUT_32_32: + case ZE_IMAGE_FORMAT_LAYOUT_32_32_32: case ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32: ZeImageFormatTypeSize = 32; break; @@ -98,7 +103,30 @@ ur_result_t ze2urImageFormat(const ze_image_desc_t *ZeImageDesc, default: logger::error( "ze2urImageFormat: unexpected image format channel y: y = 
{}\n", - ZeImageFormat.x); + ZeImageFormat.y); + return UR_RESULT_ERROR_INVALID_VALUE; + } + break; + case ZE_IMAGE_FORMAT_LAYOUT_8_8_8: + case ZE_IMAGE_FORMAT_LAYOUT_16_16_16: + case ZE_IMAGE_FORMAT_LAYOUT_32_32_32: + if (ZeImageFormat.x == ZE_IMAGE_FORMAT_SWIZZLE_R && + ZeImageFormat.y == ZE_IMAGE_FORMAT_SWIZZLE_G) { + switch (ZeImageFormat.z) { + case ZE_IMAGE_FORMAT_SWIZZLE_B: + ChannelOrder = UR_IMAGE_CHANNEL_ORDER_RGB; + break; + case ZE_IMAGE_FORMAT_SWIZZLE_X: + ChannelOrder = UR_IMAGE_CHANNEL_ORDER_RGX; + break; + default: + logger::error( + "ze2urImageFormat: unexpected image format channel z: z = {}\n", + ZeImageFormat.z); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } else { + logger::error("ze2urImageFormat: unexpected image format channel"); return UR_RESULT_ERROR_INVALID_VALUE; } break; @@ -243,6 +271,9 @@ ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat, ZeStruct &ZeImageDesc) { auto [ZeImageFormatType, ZeImageFormatTypeSize] = getImageFormatTypeAndSize(ImageFormat); + if (ZeImageFormatTypeSize == 0) { + return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT; + } // TODO: populate the layout mapping ze_image_format_layout_t ZeImageFormatLayout; switch (ImageFormat->channelOrder) { @@ -303,6 +334,24 @@ ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat, } break; } + case UR_IMAGE_CHANNEL_ORDER_RGB: + case UR_IMAGE_CHANNEL_ORDER_RGX: { + switch (ZeImageFormatTypeSize) { + case 8: + ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_8_8_8; + break; + case 16: + ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_16_16_16; + break; + case 32: + ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32; + break; + default: + logger::error("ur2zeImageDesc: unexpected data type size"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + break; + } default: logger::error("format channel order = {}", ImageFormat->channelOrder); die("ur2zeImageDesc: unsupported image channel order\n"); @@ -444,7 +493,8 @@ ur_result_t bindlessImagesCreateImpl(ur_context_handle_t 
hContext, ze_image_handle_t ZeImage; ze_memory_allocation_properties_t MemAllocProperties{ - ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES}; + ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, nullptr, + ZE_MEMORY_TYPE_UNKNOWN, 0, 0}; ZE2UR_CALL(zeMemGetAllocProperties, (hContext->ZeContext, reinterpret_cast(hImageMem), &MemAllocProperties, nullptr)); @@ -496,6 +546,8 @@ ur_result_t bindlessImagesCreateImpl(ur_context_handle_t hContext, (ZeImageTranslated, &DeviceOffset)); *phImage = DeviceOffset; + hDevice->ZeOffsetToImageHandleMap[*phImage] = ZeImage; + return UR_RESULT_SUCCESS; } @@ -624,16 +676,20 @@ getImageFormatTypeAndSize(const ur_image_format_t *ImageFormat) { logger::error( "urMemImageCreate: unsupported image data type: data type = {}", ImageFormat->channelType); - ur::unreachable(); + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FORCE_UINT32; + ZeImageFormatTypeSize = 0; } return {ZeImageFormatType, ZeImageFormatTypeSize}; } -UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, - size_t widthInBytes, size_t height, size_t elementSizeBytes, void **ppMem, - size_t *pResultPitch) { +namespace ur::level_zero { + +ur_result_t urUSMPitchedAllocExp(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t pool, size_t widthInBytes, + size_t height, size_t elementSizeBytes, + void **ppMem, size_t *pResultPitch) { std::shared_lock Lock(hContext->Mutex); UR_ASSERT(hContext && hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); @@ -666,33 +722,39 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp( *pResultPitch = RowPitch; size_t Size = height * RowPitch; - UR_CALL(urUSMDeviceAlloc(hContext, hDevice, pUSMDesc, pool, Size, ppMem)); + UR_CALL(ur::level_zero::urUSMDeviceAlloc(hContext, hDevice, pUSMDesc, pool, + Size, ppMem)); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t 
UR_APICALL -urBindlessImagesUnsampledImageHandleDestroyExp( +ur_result_t urBindlessImagesUnsampledImageHandleDestroyExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_native_handle_t hImage) { - std::ignore = hContext; - std::ignore = hDevice; - std::ignore = hImage; + UR_ASSERT(hContext && hDevice && hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + auto item = hDevice->ZeOffsetToImageHandleMap.find(hImage); + + if (item != hDevice->ZeOffsetToImageHandleMap.end()) { + ZE2UR_CALL(zeImageDestroy, (item->second)); + hDevice->ZeOffsetToImageHandleMap.erase(item); + } else { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL -urBindlessImagesSampledImageHandleDestroyExp( +ur_result_t urBindlessImagesSampledImageHandleDestroyExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_native_handle_t hImage) { // Sampled image is a combination of unsampled image and sampler. // Sampler is released in urSamplerRelease. 
- return urBindlessImagesUnsampledImageHandleDestroyExp(hContext, hDevice, - hImage); + return ur::level_zero::urBindlessImagesUnsampledImageHandleDestroyExp( + hContext, hDevice, hImage); } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( +ur_result_t urBindlessImagesImageAllocateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, ur_exp_image_mem_native_handle_t *phImageMem) { @@ -721,16 +783,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageFreeExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_exp_image_mem_native_handle_t hImageMem) { +ur_result_t +urBindlessImagesImageFreeExp(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + ur_exp_image_mem_native_handle_t hImageMem) { std::ignore = hContext; std::ignore = hDevice; - UR_CALL(urMemRelease(reinterpret_cast(hImageMem))); + UR_CALL(ur::level_zero::urMemRelease( + reinterpret_cast(hImageMem))); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( +ur_result_t urBindlessImagesUnsampledImageCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_mem_native_handle_t hImageMem, const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, @@ -740,7 +804,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( +ur_result_t urBindlessImagesSampledImageCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_mem_native_handle_t hImageMem, const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, @@ -750,8 +814,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( return UR_RESULT_SUCCESS; } 
-ur_result_t ur_queue_handle_legacy_t_::bindlessImagesImageCopyExp( - [[maybe_unused]] const void *pSrc, [[maybe_unused]] void *pDst, +ur_result_t urBindlessImagesImageCopyExp( + ur_queue_handle_t hQueue, [[maybe_unused]] const void *pSrc, + [[maybe_unused]] void *pDst, [[maybe_unused]] const ur_image_desc_t *pSrcImageDesc, [[maybe_unused]] const ur_image_desc_t *pDstImageDesc, [[maybe_unused]] const ur_image_format_t *pSrcImageFormat, @@ -761,7 +826,6 @@ ur_result_t ur_queue_handle_legacy_t_::bindlessImagesImageCopyExp( [[maybe_unused]] uint32_t numEventsInWaitList, [[maybe_unused]] const ur_event_handle_t *phEventWaitList, [[maybe_unused]] ur_event_handle_t *phEvent) { - auto hQueue = this; std::scoped_lock Lock(hQueue->Mutex); UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); @@ -911,7 +975,7 @@ ur_result_t ur_queue_handle_legacy_t_::bindlessImagesImageCopyExp( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageGetInfoExp( +ur_result_t urBindlessImagesImageGetInfoExp( ur_context_handle_t, ur_exp_image_mem_native_handle_t hImageMem, ur_image_info_t propName, void *pPropValue, size_t *pPropSizeRet) { UR_ASSERT(hImageMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); @@ -961,7 +1025,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageGetInfoExp( } } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp( +ur_result_t urBindlessImagesMipmapGetLevelExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_mem_native_handle_t hImageMem, uint32_t mipmapLevel, ur_exp_image_mem_native_handle_t *phImageMem) { @@ -975,26 +1039,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapFreeExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_exp_image_mem_native_handle_t hMem) { - return urBindlessImagesImageFreeExp(hContext, hDevice, hMem); +ur_result_t 
+urBindlessImagesMipmapFreeExp(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + ur_exp_image_mem_native_handle_t hMem) { + return ur::level_zero::urBindlessImagesImageFreeExp(hContext, hDevice, hMem); } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( +ur_result_t urBindlessImagesImportExternalMemoryExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, ur_exp_external_mem_type_t memHandleType, - ur_exp_interop_mem_desc_t *pInteropMemDesc, - ur_exp_interop_mem_handle_t *phInteropMem) { + ur_exp_external_mem_desc_t *pExternalMemDesc, + ur_exp_external_mem_handle_t *phExternalMem) { UR_ASSERT(hContext && hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pInteropMemDesc && phInteropMem, + UR_ASSERT(pExternalMemDesc && phExternalMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); struct ur_ze_external_memory_data *externalMemoryData = new struct ur_ze_external_memory_data; - void *pNext = const_cast(pInteropMemDesc->pNext); + void *pNext = const_cast(pExternalMemDesc->pNext); while (pNext != nullptr) { const ur_base_desc_t *BaseDesc = static_cast(pNext); if (BaseDesc->stype == UR_STRUCTURE_TYPE_EXP_FILE_DESCRIPTOR) { @@ -1024,6 +1089,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( break; case UR_EXP_EXTERNAL_MEM_TYPE_OPAQUE_FD: default: + delete importWin32; + delete externalMemoryData; return UR_RESULT_ERROR_INVALID_VALUE; } importWin32->handle = Win32Handle->handle; @@ -1034,23 +1101,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( } externalMemoryData->size = size; - *phInteropMem = - reinterpret_cast(externalMemoryData); + *phExternalMem = + reinterpret_cast(externalMemoryData); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( +ur_result_t urBindlessImagesMapExternalArrayExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_image_format_t *pImageFormat, const 
ur_image_desc_t *pImageDesc, - ur_exp_interop_mem_handle_t hInteropMem, + ur_exp_external_mem_handle_t hExternalMem, ur_exp_image_mem_native_handle_t *phImageMem) { - UR_ASSERT(hContext && hDevice && hInteropMem, + UR_ASSERT(hContext && hDevice && hExternalMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pImageFormat && pImageDesc, UR_RESULT_ERROR_INVALID_NULL_POINTER); struct ur_ze_external_memory_data *externalMemoryData = - reinterpret_cast(hInteropMem); + reinterpret_cast(hExternalMem); ze_image_bindless_exp_desc_t ZeImageBindlessDesc = {}; ZeImageBindlessDesc.stype = ZE_STRUCTURE_TYPE_BINDLESS_IMAGE_EXP_DESC; @@ -1074,17 +1141,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( +ur_result_t urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint64_t offset, + uint64_t size, ur_exp_external_mem_handle_t hExternalMem, void **phRetMem) { + std::ignore = hContext; + std::ignore = hDevice; + std::ignore = size; + std::ignore = offset; + std::ignore = hExternalMem; + std::ignore = phRetMem; + logger::error("[UR][L0] {} function not implemented!", + "{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urBindlessImagesReleaseExternalMemoryExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_exp_interop_mem_handle_t hInteropMem) { + ur_exp_external_mem_handle_t hExternalMem) { - UR_ASSERT(hContext && hDevice && hInteropMem, + UR_ASSERT(hContext && hDevice && hExternalMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); struct ur_ze_external_memory_data *externalMemoryData = - reinterpret_cast(hInteropMem); + reinterpret_cast(hExternalMem); - UR_CALL(urMemRelease(externalMemoryData->urMemoryHandle)); + UR_CALL(ur::level_zero::urMemRelease(externalMemoryData->urMemoryHandle)); switch (externalMemoryData->type) { case 
UR_ZE_EXTERNAL_OPAQUE_FD: @@ -1104,36 +1185,37 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( +ur_result_t urBindlessImagesImportExternalSemaphoreExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_external_semaphore_type_t semHandleType, - ur_exp_interop_semaphore_desc_t *pInteropSemaphoreDesc, - ur_exp_interop_semaphore_handle_t *phInteropSemaphoreHandle) { + ur_exp_external_semaphore_desc_t *pExternalSemaphoreDesc, + ur_exp_external_semaphore_handle_t *phExternalSemaphoreHandle) { std::ignore = hContext; std::ignore = hDevice; std::ignore = semHandleType; - std::ignore = pInteropSemaphoreDesc; - std::ignore = phInteropSemaphoreHandle; + std::ignore = pExternalSemaphoreDesc; + std::ignore = phExternalSemaphoreHandle; logger::error(logger::LegacyMessage("[UR][L0] {} function not implemented!"), "{} function not implemented!", __FUNCTION__); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( +ur_result_t urBindlessImagesReleaseExternalSemaphoreExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_exp_interop_semaphore_handle_t hInteropSemaphore) { + ur_exp_external_semaphore_handle_t hExternalSemaphore) { std::ignore = hContext; std::ignore = hDevice; - std::ignore = hInteropSemaphore; + std::ignore = hExternalSemaphore; logger::error(logger::LegacyMessage("[UR][L0] {} function not implemented!"), "{} function not implemented!", __FUNCTION__); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t ur_queue_handle_legacy_t_::bindlessImagesWaitExternalSemaphoreExp( - ur_exp_interop_semaphore_handle_t hSemaphore, bool hasValue, - uint64_t waitValue, uint32_t numEventsInWaitList, +ur_result_t urBindlessImagesWaitExternalSemaphoreExp( + ur_queue_handle_t hQueue, ur_exp_external_semaphore_handle_t hSemaphore, + bool hasValue, 
uint64_t waitValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + std::ignore = hQueue; std::ignore = hSemaphore; std::ignore = hasValue; std::ignore = waitValue; @@ -1145,10 +1227,11 @@ ur_result_t ur_queue_handle_legacy_t_::bindlessImagesWaitExternalSemaphoreExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t ur_queue_handle_legacy_t_::bindlessImagesSignalExternalSemaphoreExp( - ur_exp_interop_semaphore_handle_t hSemaphore, bool hasValue, - uint64_t signalValue, uint32_t numEventsInWaitList, +ur_result_t urBindlessImagesSignalExternalSemaphoreExp( + ur_queue_handle_t hQueue, ur_exp_external_semaphore_handle_t hSemaphore, + bool hasValue, uint64_t signalValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + std::ignore = hQueue; std::ignore = hSemaphore; std::ignore = hasValue; std::ignore = signalValue; @@ -1159,3 +1242,5 @@ ur_result_t ur_queue_handle_legacy_t_::bindlessImagesSignalExternalSemaphoreExp( "{} function not implemented!", __FUNCTION__); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/image.hpp b/source/adapters/level_zero/image.hpp index 618258601d..43f37fa757 100644 --- a/source/adapters/level_zero/image.hpp +++ b/source/adapters/level_zero/image.hpp @@ -10,7 +10,7 @@ #pragma once #include -#include +#include #include #include diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index cb020395ed..c77bb22b8c 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -11,9 +11,29 @@ #include "kernel.hpp" #include "logger/ur_logger.hpp" #include "ur_api.h" -#include "ur_level_zero.hpp" +#include "ur_interface_loader.hpp" -UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( +#include "helpers/kernel_helpers.hpp" + +ur_result_t getZeKernel(ze_device_handle_t 
hDevice, ur_kernel_handle_t hKernel, + ze_kernel_handle_t *phZeKernel) { + if (hKernel->ZeKernelMap.empty()) { + *phZeKernel = hKernel->ZeKernel; + } else { + auto It = hKernel->ZeKernelMap.find(hDevice); + if (It == hKernel->ZeKernelMap.end()) { + /* kernel and queue don't match */ + return UR_RESULT_ERROR_INVALID_QUEUE; + } + *phZeKernel = It->second; + } + + return UR_RESULT_SUCCESS; +} + +namespace ur::level_zero { + +ur_result_t urKernelGetSuggestedLocalWorkSize( ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim, [[maybe_unused]] const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) { @@ -27,81 +47,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( std::copy(pGlobalWorkSize, pGlobalWorkSize + workDim, GlobalWorkSize3D); ze_kernel_handle_t ZeKernel{}; - UR_CALL(getZeKernel(Legacy(hQueue), hKernel, &ZeKernel)); + UR_CALL(getZeKernel(hQueue->Device->ZeDevice, hKernel, &ZeKernel)); - UR_CALL(getSuggestedLocalWorkSize(Legacy(hQueue), ZeKernel, GlobalWorkSize3D, + UR_CALL(getSuggestedLocalWorkSize(hQueue->Device, ZeKernel, GlobalWorkSize3D, LocalWorkSize)); std::copy(LocalWorkSize, LocalWorkSize + workDim, pSuggestedLocalWorkSize); return UR_RESULT_SUCCESS; } -ur_result_t getZeKernel(ur_queue_handle_legacy_t hQueue, - ur_kernel_handle_t hKernel, - ze_kernel_handle_t *phZeKernel) { - auto ZeDevice = hQueue->Device->ZeDevice; - - if (hKernel->ZeKernelMap.empty()) { - *phZeKernel = hKernel->ZeKernel; - } else { - auto It = hKernel->ZeKernelMap.find(ZeDevice); - if (It == hKernel->ZeKernelMap.end()) { - /* kernel and queue don't match */ - return UR_RESULT_ERROR_INVALID_QUEUE; - } - *phZeKernel = It->second; - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t getSuggestedLocalWorkSize(ur_queue_handle_legacy_t hQueue, - ze_kernel_handle_t hZeKernel, - size_t GlobalWorkSize3D[3], - uint32_t SuggestedLocalWorkSize3D[3]) { - uint32_t *WG = SuggestedLocalWorkSize3D; - - // We can't call 
to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize - // values do not fit to 32-bit that the API only supports currently. - bool SuggestGroupSize = true; - for (int I : {0, 1, 2}) { - if (GlobalWorkSize3D[I] > UINT32_MAX) { - SuggestGroupSize = false; - } - } - if (SuggestGroupSize) { - ZE2UR_CALL(zeKernelSuggestGroupSize, - (hZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1], - GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2])); - } else { - for (int I : {0, 1, 2}) { - // Try to find a I-dimension WG size that the GlobalWorkSize[I] is - // fully divisable with. Start with the max possible size in - // each dimension. - uint32_t GroupSize[] = { - hQueue->Device->ZeDeviceComputeProperties->maxGroupSizeX, - hQueue->Device->ZeDeviceComputeProperties->maxGroupSizeY, - hQueue->Device->ZeDeviceComputeProperties->maxGroupSizeZ}; - GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]); - while (GlobalWorkSize3D[I] % GroupSize[I]) { - --GroupSize[I]; - } - if (GlobalWorkSize3D[I] / GroupSize[I] > UINT32_MAX) { - logger::error("getSuggestedLocalWorkSize: can't find a WG size " - "suitable for global work size > UINT32_MAX"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - WG[I] = GroupSize[I]; - } - logger::debug( - "getSuggestedLocalWorkSize: using computed WG size = {{{}, {}, {}}}", - WG[0], WG[1], WG[2]); - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_handle_legacy_t_::enqueueKernelLaunch( +ur_result_t urEnqueueKernelLaunch( + ur_queue_handle_t Queue, ///< [in] handle of the queue object ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object uint32_t WorkDim, ///< [in] number of dimensions, from 1 to 3, to specify ///< the global and work-group work-items @@ -133,9 +89,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueKernelLaunch( UR_ASSERT(WorkDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(WorkDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - auto Queue = this; ze_kernel_handle_t ZeKernel{}; - 
UR_CALL(getZeKernel(Queue, Kernel, &ZeKernel)); + UR_CALL(getZeKernel(Queue->Device->ZeDevice, Kernel, &ZeKernel)); // Lock automatically releases when this goes out of scope. std::scoped_lock Lock( @@ -158,7 +113,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueKernelLaunch( char **ZeHandlePtr = nullptr; if (Arg.Value) { UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode, - Queue->Device)); + Queue->Device, EventWaitList, + NumEventsInWaitList)); } ZE2UR_CALL(zeKernelSetArgumentValue, (ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr)); @@ -168,68 +124,9 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueKernelLaunch( ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; uint32_t WG[3]{}; - // New variable needed because GlobalWorkSize parameter might not be of size 3 - size_t GlobalWorkSize3D[3]{1, 1, 1}; - std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D); - - if (LocalWorkSize) { - for (uint32_t I = 0; I < WorkDim; ++I) { - UR_ASSERT(LocalWorkSize[I] < (std::numeric_limits::max)(), - UR_RESULT_ERROR_INVALID_VALUE); - WG[I] = static_cast(LocalWorkSize[I]); - } - } else { - UR_CALL(getSuggestedLocalWorkSize(Queue, ZeKernel, GlobalWorkSize3D, WG)); - } - - // TODO: assert if sizes do not fit into 32-bit? 
- - switch (WorkDim) { - case 3: - ZeThreadGroupDimensions.groupCountX = - static_cast(GlobalWorkSize3D[0] / WG[0]); - ZeThreadGroupDimensions.groupCountY = - static_cast(GlobalWorkSize3D[1] / WG[1]); - ZeThreadGroupDimensions.groupCountZ = - static_cast(GlobalWorkSize3D[2] / WG[2]); - break; - case 2: - ZeThreadGroupDimensions.groupCountX = - static_cast(GlobalWorkSize3D[0] / WG[0]); - ZeThreadGroupDimensions.groupCountY = - static_cast(GlobalWorkSize3D[1] / WG[1]); - WG[2] = 1; - break; - case 1: - ZeThreadGroupDimensions.groupCountX = - static_cast(GlobalWorkSize3D[0] / WG[0]); - WG[1] = WG[2] = 1; - break; - - default: - logger::error("urEnqueueKernelLaunch: unsupported work_dim"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - - // Error handling for non-uniform group size case - if (GlobalWorkSize3D[0] != - size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) { - logger::error("urEnqueueKernelLaunch: invalid work_dim. The range is not a " - "multiple of the group size in the 1st dimension"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - if (GlobalWorkSize3D[1] != - size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) { - logger::error("urEnqueueKernelLaunch: invalid work_dim. The range is not a " - "multiple of the group size in the 2nd dimension"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - if (GlobalWorkSize3D[2] != - size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) { - logger::debug("urEnqueueKernelLaunch: invalid work_dim. The range is not a " - "multiple of the group size in the 3rd dimension"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } + UR_CALL(calculateKernelWorkDimensions(Kernel->ZeKernel, Queue->Device, + ZeThreadGroupDimensions, WG, WorkDim, + GlobalWorkSize, LocalWorkSize)); ZE2UR_CALL(zeKernelSetGroupSize, (ZeKernel, WG[0], WG[1], WG[2])); @@ -264,7 +161,7 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueKernelLaunch( // is in use. 
Once the event has been signalled, the code in // CleanupCompletedEvent(Event) will do a urKernelRelease to update the // reference count on the kernel, using the kernel saved in CommandData. - UR_CALL(urKernelRetain(Kernel)); + UR_CALL(ur::level_zero::urKernelRetain(Kernel)); // Add to list of kernels to be submitted if (IndirectAccessTrackingEnabled) @@ -310,7 +207,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueKernelLaunch( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::enqueueCooperativeKernelLaunchExp( +ur_result_t urEnqueueCooperativeKernelLaunchExp( + ur_queue_handle_t Queue, ///< [in] handle of the queue object ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object uint32_t WorkDim, ///< [in] number of dimensions, from 1 to 3, to specify ///< the global and work-group work-items @@ -342,7 +240,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueCooperativeKernelLaunchExp( UR_ASSERT(WorkDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(WorkDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - auto Queue = this; auto ZeDevice = Queue->Device->ZeDevice; ze_kernel_handle_t ZeKernel{}; @@ -377,7 +274,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueCooperativeKernelLaunchExp( char **ZeHandlePtr = nullptr; if (Arg.Value) { UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode, - Queue->Device)); + Queue->Device, EventWaitList, + NumEventsInWaitList)); } ZE2UR_CALL(zeKernelSetArgumentValue, (ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr)); @@ -528,7 +426,7 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueCooperativeKernelLaunchExp( // is in use. Once the event has been signalled, the code in // CleanupCompletedEvent(Event) will do a urKernelRelease to update the // reference count on the kernel, using the kernel saved in CommandData. 
- UR_CALL(urKernelRetain(Kernel)); + UR_CALL(ur::level_zero::urKernelRetain(Kernel)); // Add to list of kernels to be submitted if (IndirectAccessTrackingEnabled) @@ -574,7 +472,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueCooperativeKernelLaunchExp( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::enqueueDeviceGlobalVariableWrite( +ur_result_t urEnqueueDeviceGlobalVariableWrite( + ur_queue_handle_t Queue, ///< [in] handle of the queue to submit to. ur_program_handle_t Program, ///< [in] handle of the program containing the ///< device global variable. const char @@ -595,14 +494,21 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueDeviceGlobalVariableWrite( *Event ///< [in,out][optional] return an event object that identifies ///< this particular kernel execution instance. ) { - auto Queue = this; std::scoped_lock lock(Queue->Mutex); + ze_module_handle_t ZeModule{}; + auto It = Program->ZeModuleMap.find(Queue->Device->ZeDevice); + if (It != Program->ZeModuleMap.end()) { + ZeModule = It->second; + } else { + ZeModule = Program->ZeModule; + } + // Find global variable pointer size_t GlobalVarSize = 0; void *GlobalVarPtr = nullptr; ZE2UR_CALL(zeModuleGetGlobalPointer, - (Program->ZeModule, Name, &GlobalVarSize, &GlobalVarPtr)); + (ZeModule, Name, &GlobalVarSize, &GlobalVarPtr)); if (GlobalVarSize < Offset + Count) { setErrorMessage("Write device global variable is out of range.", UR_RESULT_ERROR_INVALID_VALUE, @@ -613,6 +519,11 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueDeviceGlobalVariableWrite( // Copy engine is preferred only for host to device transfer. // Device to device transfers run faster on compute engines. bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. 
+ if (Queue->Device->isDG2() && IsSharedPointer(Queue->Context, Src)) { + PreferCopyEngine = false; + } // Temporary option added to use copy engine for D2D copy PreferCopyEngine |= UseCopyEngineForD2DCopy; @@ -623,36 +534,43 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueDeviceGlobalVariableWrite( EventWaitList, Event, PreferCopyEngine); } -ur_result_t ur_queue_handle_legacy_t_::enqueueDeviceGlobalVariableRead( - ur_program_handle_t Program, ///< [in] handle of the program containing - ///< the device global variable. - const char *Name, ///< [in] the unique identifier for the device global - ///< variable. +ur_result_t urEnqueueDeviceGlobalVariableRead( + ur_queue_handle_t Queue, ///< [in] handle of the queue to submit to. + ur_program_handle_t Program, ///< [in] handle of the program containing the + ///< device global variable. + const char + *Name, ///< [in] the unique identifier for the device global variable. bool BlockingRead, ///< [in] indicates if this operation should block. size_t Count, ///< [in] the number of bytes to copy. - size_t Offset, ///< [in] the byte offset into the device global variable - ///< to start copying. - void *Dst, ///< [in] pointer to where the data must be copied to. + size_t Offset, ///< [in] the byte offset into the device global variable to + ///< start copying. + void *Dst, ///< [in] pointer to where the data must be copied to. uint32_t NumEventsInWaitList, ///< [in] size of the event wait list. const ur_event_handle_t *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] - ///< pointer to a list of events that must be - ///< complete before the kernel execution. If - ///< nullptr, the numEventsInWaitList must be 0, - ///< indicating that no wait event. + ///< pointer to a list of events that must be complete + ///< before the kernel execution. If nullptr, the + ///< numEventsInWaitList must be 0, indicating that no + ///< wait event. 
ur_event_handle_t - *Event ///< [in,out][optional] return an event object that - ///< identifies this particular kernel execution instance. + *Event ///< [in,out][optional] return an event object that identifies + ///< this particular kernel execution instance. ) { - auto Queue = this; - std::scoped_lock lock(Queue->Mutex); + ze_module_handle_t ZeModule{}; + auto It = Program->ZeModuleMap.find(Queue->Device->ZeDevice); + if (It != Program->ZeModuleMap.end()) { + ZeModule = It->second; + } else { + ZeModule = Program->ZeModule; + } + // Find global variable pointer size_t GlobalVarSize = 0; void *GlobalVarPtr = nullptr; ZE2UR_CALL(zeModuleGetGlobalPointer, - (Program->ZeModule, Name, &GlobalVarSize, &GlobalVarPtr)); + (ZeModule, Name, &GlobalVarSize, &GlobalVarPtr)); if (GlobalVarSize < Offset + Count) { setErrorMessage("Read from device global variable is out of range.", UR_RESULT_ERROR_INVALID_VALUE, @@ -663,6 +581,11 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueDeviceGlobalVariableRead( // Copy engine is preferred only for host to device transfer. // Device to device transfers run faster on compute engines. bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Dst); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. + if (Queue->Device->isDG2() && IsSharedPointer(Queue->Context, Dst)) { + PreferCopyEngine = false; + } // Temporary option added to use copy engine for D2D copy PreferCopyEngine |= UseCopyEngineForD2DCopy; @@ -673,7 +596,7 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueDeviceGlobalVariableRead( EventWaitList, Event, PreferCopyEngine); } -UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate( +ur_result_t urKernelCreate( ur_program_handle_t Program, ///< [in] handle of the program instance const char *KernelName, ///< [in] pointer to null-terminated string. 
ur_kernel_handle_t @@ -700,7 +623,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate( ZeKernelDesc.pKernelName = KernelName; ze_kernel_handle_t ZeKernel; - ZE2UR_CALL(zeKernelCreate, (ZeModule, &ZeKernelDesc, &ZeKernel)); + auto ZeResult = + ZE_CALL_NOCHECK(zeKernelCreate, (ZeModule, &ZeKernelDesc, &ZeKernel)); + // Gracefully handle the case that kernel create fails. + if (ZeResult != ZE_RESULT_SUCCESS) { + delete *RetKernel; + *RetKernel = nullptr; + return ze2urResult(ZeResult); + } auto ZeDevice = It.first; @@ -729,7 +659,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( +ur_result_t urKernelSetArgValue( ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] size_t ArgSize, ///< [in] size of argument type @@ -754,23 +684,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( PArgValue = nullptr; } + if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX; + } + std::scoped_lock Guard(Kernel->Mutex); + ze_result_t ZeResult = ZE_RESULT_SUCCESS; if (Kernel->ZeKernelMap.empty()) { auto ZeKernel = Kernel->ZeKernel; - ZE2UR_CALL(zeKernelSetArgumentValue, - (ZeKernel, ArgIndex, ArgSize, PArgValue)); + ZeResult = ZE_CALL_NOCHECK(zeKernelSetArgumentValue, + (ZeKernel, ArgIndex, ArgSize, PArgValue)); } else { for (auto It : Kernel->ZeKernelMap) { auto ZeKernel = It.second; - ZE2UR_CALL(zeKernelSetArgumentValue, - (ZeKernel, ArgIndex, ArgSize, PArgValue)); + ZeResult = ZE_CALL_NOCHECK(zeKernelSetArgumentValue, + (ZeKernel, ArgIndex, ArgSize, PArgValue)); } } - return UR_RESULT_SUCCESS; + if (ZeResult == ZE_RESULT_ERROR_INVALID_ARGUMENT) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE; + } + + return ze2urResult(ZeResult); } -UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgLocal( +ur_result_t urKernelSetArgLocal( 
ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] size_t ArgSize, ///< [in] size of the local buffer to be allocated by the @@ -780,12 +719,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgLocal( ) { std::ignore = Properties; - UR_CALL(urKernelSetArgValue(Kernel, ArgIndex, ArgSize, nullptr, nullptr)); + UR_CALL(ur::level_zero::urKernelSetArgValue(Kernel, ArgIndex, ArgSize, + nullptr, nullptr)); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo( +ur_result_t urKernelGetInfo( ur_kernel_handle_t Kernel, ///< [in] handle of the Kernel object ur_kernel_info_t ParamName, ///< [in] name of the Kernel property to query size_t PropSize, ///< [in] the size of the Kernel property value. @@ -816,6 +756,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo( } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } + case UR_KERNEL_INFO_NUM_REGS: case UR_KERNEL_INFO_NUM_ARGS: return ReturnValue(uint32_t{Kernel->ZeKernelProperties->numKernelArgs}); case UR_KERNEL_INFO_REFERENCE_COUNT: @@ -846,7 +787,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( +ur_result_t urKernelGetGroupInfo( ur_kernel_handle_t Kernel, ///< [in] handle of the Kernel object ur_device_handle_t Device, ///< [in] handle of the Device object ur_kernel_group_info_t @@ -917,6 +858,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { return ReturnValue(uint32_t{Kernel->ZeKernelProperties->privateMemSize}); } + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: + // No corresponding enumeration in Level Zero + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: { logger::error( "Unknown ParamName in urKernelGetGroupInfo: ParamName={}(0x{})", @@ -927,7 +872,7 @@ UR_APIEXPORT 
ur_result_t UR_APICALL urKernelGetGroupInfo( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSubGroupInfo( +ur_result_t urKernelGetSubGroupInfo( ur_kernel_handle_t Kernel, ///< [in] handle of the Kernel object ur_device_handle_t Device, ///< [in] handle of the Device object ur_kernel_sub_group_info_t @@ -958,7 +903,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSubGroupInfo( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain( +ur_result_t urKernelRetain( ur_kernel_handle_t Kernel ///< [in] handle for the Kernel to retain ) { Kernel->RefCount.increment(); @@ -966,7 +911,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelRelease( +ur_result_t urKernelRelease( ur_kernel_handle_t Kernel ///< [in] handle for the Kernel to release ) { if (!Kernel->RefCount.decrementAndTest()) @@ -983,7 +928,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelRelease( } Kernel->ZeKernelMap.clear(); if (IndirectAccessTrackingEnabled) { - UR_CALL(urContextRelease(KernelProgram->Context)); + UR_CALL(ur::level_zero::urContextRelease(KernelProgram->Context)); } // do a release on the program this kernel was part of without delete of the // program handle @@ -994,7 +939,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelRelease( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( +ur_result_t urKernelSetArgPointer( ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] const ur_kernel_arg_pointer_properties_t @@ -1006,12 +951,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( std::ignore = Properties; // KernelSetArgValue is expecting a pointer to the argument - UR_CALL(urKernelSetArgValue(Kernel, ArgIndex, sizeof(const void *), nullptr, - &ArgValue)); + UR_CALL(ur::level_zero::urKernelSetArgValue( + Kernel, ArgIndex, sizeof(const 
void *), nullptr, &ArgValue)); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( +ur_result_t urKernelSetExecInfo( ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object ur_kernel_exec_info_t PropName, ///< [in] name of the execution attribute size_t PropSize, ///< [in] size in byte the attribute value @@ -1057,7 +1002,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler( +ur_result_t urKernelSetArgSampler( ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] const ur_kernel_arg_sampler_properties_t @@ -1066,13 +1011,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler( ) { std::ignore = Properties; std::scoped_lock Guard(Kernel->Mutex); + if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX; + } ZE2UR_CALL(zeKernelSetArgumentValue, (Kernel->ZeKernel, ArgIndex, sizeof(void *), &ArgValue->ZeSampler)); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( +ur_result_t urKernelSetArgMemObj( ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] const ur_kernel_arg_mem_obj_properties_t @@ -1085,6 +1033,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( // The ArgValue may be a NULL pointer in which case a NULL value is used for // the kernel argument declared as a pointer to global or constant memory. 
+ if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX; + } + ur_mem_handle_t_ *UrMem = ur_cast(ArgValue); ur_mem_handle_t_::access_mode_t UrAccessMode = ur_mem_handle_t_::read_write; @@ -1110,7 +1062,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( +ur_result_t urKernelGetNativeHandle( ur_kernel_handle_t Kernel, ///< [in] handle of the kernel. ur_native_handle_t *NativeKernel ///< [out] a pointer to the native handle of the kernel. @@ -1121,7 +1073,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( +ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, size_t localWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { (void)localWorkSize; @@ -1134,7 +1086,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( +ur_result_t urKernelCreateWithNativeHandle( ur_native_handle_t NativeKernel, ///< [in] the native handle of the kernel. 
ur_context_handle_t Context, ///< [in] handle of the context object ur_program_handle_t Program, @@ -1170,13 +1122,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( return UR_RESULT_SUCCESS; } +ur_result_t urKernelSetSpecializationConstants( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t Count, ///< [in] the number of elements in the pSpecConstants array + const ur_specialization_constant_info_t + *SpecConstants ///< [in] array of specialization constant value + ///< descriptions +) { + std::ignore = Kernel; + std::ignore = Count; + std::ignore = SpecConstants; + logger::error(logger::LegacyMessage("[UR][L0] {} function not implemented!"), + "{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +} // namespace ur::level_zero + ur_result_t ur_kernel_handle_t_::initialize() { // Retain the program and context to show it's used by this kernel. - UR_CALL(urProgramRetain(Program)); + UR_CALL(ur::level_zero::urProgramRetain(Program)); if (IndirectAccessTrackingEnabled) // TODO: do piContextRetain without the guard - UR_CALL(urContextRetain(Program->Context)); + UR_CALL(ur::level_zero::urContextRetain(Program->Context)); // Set up how to obtain kernel properties when needed. 
ZeKernelProperties.Compute = [this](ze_kernel_properties_t &Properties) { @@ -1195,36 +1164,3 @@ ur_result_t ur_kernel_handle_t_::initialize() { return UR_RESULT_SUCCESS; } - -UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants( - ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object - uint32_t Count, ///< [in] the number of elements in the pSpecConstants array - const ur_specialization_constant_info_t - *SpecConstants ///< [in] array of specialization constant value - ///< descriptions -) { - std::ignore = Kernel; - std::ignore = Count; - std::ignore = SpecConstants; - logger::error(logger::LegacyMessage("[UR][L0] {} function not implemented!"), - "{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t ur_queue_handle_legacy_t_::enqueueKernelLaunchCustomExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_exp_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - std::ignore = hKernel; - std::ignore = workDim; - std::ignore = pGlobalWorkSize; - std::ignore = pLocalWorkSize; - std::ignore = numPropsInLaunchPropList; - std::ignore = launchPropList; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} diff --git a/source/adapters/level_zero/kernel.hpp b/source/adapters/level_zero/kernel.hpp index 3e74f4d8bc..9444ff0084 100644 --- a/source/adapters/level_zero/kernel.hpp +++ b/source/adapters/level_zero/kernel.hpp @@ -108,10 +108,5 @@ struct ur_kernel_handle_t_ : _ur_object { ZeCache ZeKernelName; }; -ur_result_t getSuggestedLocalWorkSize(ur_queue_handle_legacy_t hQueue, - ze_kernel_handle_t hZeKernel, - size_t GlobalWorkSize3D[3], - uint32_t SuggestedLocalWorkSize3D[3]); -ur_result_t 
getZeKernel(ur_queue_handle_legacy_t hQueue, - ur_kernel_handle_t hKernel, +ur_result_t getZeKernel(ze_device_handle_t hDevice, ur_kernel_handle_t hKernel, ze_kernel_handle_t *phZeKernel); diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp index 95650a7b94..e7ff6dfea1 100644 --- a/source/adapters/level_zero/memory.cpp +++ b/source/adapters/level_zero/memory.cpp @@ -15,9 +15,11 @@ #include "context.hpp" #include "event.hpp" +#include "helpers/memory_helpers.hpp" #include "image.hpp" #include "logger/ur_logger.hpp" #include "queue.hpp" +#include "ur_interface_loader.hpp" #include "ur_level_zero.hpp" // Default to using compute engine for fill operation, but allow to @@ -42,11 +44,24 @@ bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr) { return (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_DEVICE); } +// Helper function to check if a pointer is a shared pointer. +bool IsSharedPointer(ur_context_handle_t Context, const void *Ptr) { + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + + // Query memory type of the pointer + ZE2UR_CALL(zeMemGetAllocProperties, + (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + return (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_SHARED); +} + // Shared by all memory read/write/copy PI interfaces. // PI interfaces must have queue's and destination buffer's mutexes locked for // exclusive use and source buffer's mutex locked for shared use on entry. 
ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, - ur_queue_handle_legacy_t Queue, void *Dst, + ur_queue_handle_t Queue, void *Dst, ur_bool_t BlockingWrite, size_t Size, const void *Src, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, @@ -99,13 +114,12 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, // PI interfaces must have queue's and destination buffer's mutexes locked for // exclusive use and source buffer's mutex locked for shared use on entry. ur_result_t enqueueMemCopyRectHelper( - ur_command_t CommandType, ur_queue_handle_legacy_t Queue, - const void *SrcBuffer, void *DstBuffer, ur_rect_offset_t SrcOrigin, - ur_rect_offset_t DstOrigin, ur_rect_region_t Region, size_t SrcRowPitch, - size_t DstRowPitch, size_t SrcSlicePitch, size_t DstSlicePitch, - ur_bool_t Blocking, uint32_t NumEventsInWaitList, - const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, - bool PreferCopyEngine) { + ur_command_t CommandType, ur_queue_handle_t Queue, const void *SrcBuffer, + void *DstBuffer, ur_rect_offset_t SrcOrigin, ur_rect_offset_t DstOrigin, + ur_rect_region_t Region, size_t SrcRowPitch, size_t DstRowPitch, + size_t SrcSlicePitch, size_t DstSlicePitch, ur_bool_t Blocking, + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_event_handle_t *OutEvent, bool PreferCopyEngine) { bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); _ur_ze_event_list_t TmpWaitList; @@ -140,40 +154,13 @@ ur_result_t enqueueMemCopyRectHelper( ur_cast(ZeEvent)); printZeEventList(WaitList); - uint32_t SrcOriginX = ur_cast(SrcOrigin.x); - uint32_t SrcOriginY = ur_cast(SrcOrigin.y); - uint32_t SrcOriginZ = ur_cast(SrcOrigin.z); - - uint32_t SrcPitch = SrcRowPitch; - if (SrcPitch == 0) - SrcPitch = ur_cast(Region.width); - - if (SrcSlicePitch == 0) - SrcSlicePitch = ur_cast(Region.height) * SrcPitch; - - uint32_t DstOriginX = ur_cast(DstOrigin.x); - uint32_t DstOriginY = ur_cast(DstOrigin.y); - uint32_t 
DstOriginZ = ur_cast(DstOrigin.z); - - uint32_t DstPitch = DstRowPitch; - if (DstPitch == 0) - DstPitch = ur_cast(Region.width); - - if (DstSlicePitch == 0) - DstSlicePitch = ur_cast(Region.height) * DstPitch; - - uint32_t Width = ur_cast(Region.width); - uint32_t Height = ur_cast(Region.height); - uint32_t Depth = ur_cast(Region.depth); - - const ze_copy_region_t ZeSrcRegion = {SrcOriginX, SrcOriginY, SrcOriginZ, - Width, Height, Depth}; - const ze_copy_region_t ZeDstRegion = {DstOriginX, DstOriginY, DstOriginZ, - Width, Height, Depth}; + auto ZeParams = ur2zeRegionParams(SrcOrigin, DstOrigin, Region, SrcRowPitch, + DstRowPitch, SrcSlicePitch, DstSlicePitch); ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, - (ZeCommandList, DstBuffer, &ZeDstRegion, DstPitch, DstSlicePitch, - SrcBuffer, &ZeSrcRegion, SrcPitch, SrcSlicePitch, ZeEvent, + (ZeCommandList, DstBuffer, &ZeParams.dstRegion, ZeParams.dstPitch, + ZeParams.dstSlicePitch, SrcBuffer, &ZeParams.srcRegion, + ZeParams.srcPitch, ZeParams.srcSlicePitch, ZeEvent, WaitList.Length, WaitList.ZeEventList)); logger::debug("calling zeCommandListAppendMemoryCopyRegion()"); @@ -185,9 +172,9 @@ ur_result_t enqueueMemCopyRectHelper( // PI interfaces must have queue's and buffer's mutexes locked on entry. static ur_result_t enqueueMemFillHelper(ur_command_t CommandType, - ur_queue_handle_legacy_t Queue, - void *Ptr, const void *Pattern, - size_t PatternSize, size_t Size, + ur_queue_handle_t Queue, void *Ptr, + const void *Pattern, size_t PatternSize, + size_t Size, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent) { @@ -302,7 +289,7 @@ static ur_result_t ZeHostMemAllocHelper(void **ResultPtr, // indirect access, that is why explicitly retain context to be sure // that it is released after all memory allocations in this context are // released. 
- UR_CALL(urContextRetain(UrContext)); + UR_CALL(ur::level_zero::urContextRetain(UrContext)); } ZeStruct ZeDesc; @@ -324,7 +311,7 @@ static ur_result_t ZeHostMemAllocHelper(void **ResultPtr, // PI interfaces must have queue's and destination image's mutexes locked for // exclusive use and source image's mutex locked for shared use on entry. static ur_result_t enqueueMemImageCommandHelper( - ur_command_t CommandType, ur_queue_handle_legacy_t Queue, + ur_command_t CommandType, ur_queue_handle_t Queue, const void *Src, // image or ptr void *Dst, // image or ptr ur_bool_t IsBlocking, ur_rect_offset_t *SrcOrigin, @@ -393,7 +380,8 @@ static ur_result_t enqueueMemImageCommandHelper( char *ZeHandleSrc = nullptr; UR_CALL(SrcMem->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, - Queue->Device)); + Queue->Device, EventWaitList, + NumEventsInWaitList)); ZE2UR_CALL(zeCommandListAppendImageCopyToMemory, (ZeCommandList, Dst, ur_cast(ZeHandleSrc), &ZeSrcRegion, ZeEvent, WaitList.Length, WaitList.ZeEventList)); @@ -426,7 +414,8 @@ static ur_result_t enqueueMemImageCommandHelper( char *ZeHandleDst = nullptr; UR_CALL(DstMem->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - Queue->Device)); + Queue->Device, EventWaitList, + NumEventsInWaitList)); ZE2UR_CALL(zeCommandListAppendImageCopyFromMemory, (ZeCommandList, ur_cast(ZeHandleDst), Src, &ZeDstRegion, ZeEvent, WaitList.Length, WaitList.ZeEventList)); @@ -444,9 +433,11 @@ static ur_result_t enqueueMemImageCommandHelper( char *ZeHandleSrc = nullptr; char *ZeHandleDst = nullptr; UR_CALL(SrcImage->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, - Queue->Device)); + Queue->Device, EventWaitList, + NumEventsInWaitList)); UR_CALL(DstImage->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - Queue->Device)); + Queue->Device, EventWaitList, + NumEventsInWaitList)); ZE2UR_CALL(zeCommandListAppendImageCopyRegion, (ZeCommandList, ur_cast(ZeHandleDst), ur_cast(ZeHandleSrc), &ZeDstRegion, @@ -461,7 +452,10 @@ static 
ur_result_t enqueueMemImageCommandHelper( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferRead( +namespace ur::level_zero { + +ur_result_t urEnqueueMemBufferRead( + ur_queue_handle_t Queue, ///< [in] handle of the queue object ur_mem_handle_t hBuffer, ///< [in] handle of the buffer object bool blockingRead, ///< [in] indicates blocking (true), non-blocking (false) size_t offset, ///< [in] offset in bytes in the buffer object @@ -479,7 +473,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferRead( *phEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. ) { - auto Queue = this; ur_mem_handle_t_ *Src = ur_cast(hBuffer); std::shared_lock SrcLock(Src->Mutex, std::defer_lock); @@ -488,14 +481,16 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferRead( char *ZeHandleSrc = nullptr; UR_CALL(Src->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, - Queue->Device)); + Queue->Device, phEventWaitList, + numEventsInWaitList)); return enqueueMemCopyHelper(UR_COMMAND_MEM_BUFFER_READ, Queue, pDst, blockingRead, size, ZeHandleSrc + offset, numEventsInWaitList, phEventWaitList, phEvent, true /* PreferCopyEngine */); } -ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferWrite( +ur_result_t urEnqueueMemBufferWrite( + ur_queue_handle_t Queue, ///< [in] handle of the queue object ur_mem_handle_t hBuffer, ///< [in] handle of the buffer object bool blockingWrite, ///< [in] indicates blocking (true), non-blocking (false) @@ -515,7 +510,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferWrite( *phEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. 
) { - auto Queue = this; ur_mem_handle_t_ *Buffer = ur_cast(hBuffer); std::scoped_lock Lock(Queue->Mutex, @@ -523,7 +517,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferWrite( char *ZeHandleDst = nullptr; UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - Queue->Device)); + Queue->Device, phEventWaitList, + numEventsInWaitList)); return enqueueMemCopyHelper(UR_COMMAND_MEM_BUFFER_WRITE, Queue, ZeHandleDst + offset, // dst blockingWrite, size, @@ -532,7 +527,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferWrite( true /* PreferCopyEngine */); } -ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferReadRect( +ur_result_t urEnqueueMemBufferReadRect( + ur_queue_handle_t Queue, ///< [in] handle of the queue object ur_mem_handle_t hBuffer, ///< [in] handle of the buffer object bool blockingRead, ///< [in] indicates blocking (true), non-blocking (false) ur_rect_offset_t bufferOffset, ///< [in] 3D offset in the buffer @@ -560,7 +556,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferReadRect( *phEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. 
) { - auto Queue = this; ur_mem_handle_t_ *Buffer = ur_cast(hBuffer); std::shared_lock SrcLock(Buffer->Mutex, std::defer_lock); @@ -569,7 +564,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferReadRect( char *ZeHandleSrc; UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, - Queue->Device)); + Queue->Device, phEventWaitList, + numEventsInWaitList)); return enqueueMemCopyRectHelper( UR_COMMAND_MEM_BUFFER_READ_RECT, Queue, ZeHandleSrc, pDst, bufferOffset, hostOffset, region, bufferRowPitch, hostRowPitch, bufferSlicePitch, @@ -577,7 +573,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferReadRect( phEvent); } -ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferWriteRect( +ur_result_t urEnqueueMemBufferWriteRect( + ur_queue_handle_t Queue, ///< [in] handle of the queue object ur_mem_handle_t hBuffer, ///< [in] handle of the buffer object bool blockingWrite, ///< [in] indicates blocking (true), non-blocking (false) @@ -607,7 +604,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferWriteRect( *phEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. 
) { - auto Queue = this; ur_mem_handle_t_ *Buffer = ur_cast(hBuffer); std::scoped_lock Lock(Queue->Mutex, @@ -615,7 +611,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferWriteRect( char *ZeHandleDst = nullptr; UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - Queue->Device)); + Queue->Device, phEventWaitList, + numEventsInWaitList)); return enqueueMemCopyRectHelper( UR_COMMAND_MEM_BUFFER_WRITE_RECT, Queue, const_cast(static_cast(pSrc)), ZeHandleDst, @@ -624,7 +621,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferWriteRect( phEventWaitList, phEvent); } -ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferCopy( +ur_result_t urEnqueueMemBufferCopy( + ur_queue_handle_t Queue, ///< [in] handle of the queue object ur_mem_handle_t BufferSrc, ///< [in] handle of the src buffer object ur_mem_handle_t BufferDst, ///< [in] handle of the dest buffer object size_t SrcOffset, ///< [in] offset into hBufferSrc to begin copying from @@ -642,7 +640,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferCopy( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. 
) { - auto Queue = this; _ur_buffer *SrcBuffer = ur_cast<_ur_buffer *>(BufferSrc); _ur_buffer *DstBuffer = ur_cast<_ur_buffer *>(BufferDst); @@ -663,10 +660,12 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferCopy( char *ZeHandleSrc = nullptr; UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, - Queue->Device)); + Queue->Device, EventWaitList, + NumEventsInWaitList)); char *ZeHandleDst = nullptr; UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - Queue->Device)); + Queue->Device, EventWaitList, + NumEventsInWaitList)); return enqueueMemCopyHelper( UR_COMMAND_MEM_BUFFER_COPY, Queue, ZeHandleDst + DstOffset, @@ -675,9 +674,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferCopy( OutEvent, PreferCopyEngine); } -ur_result_t -ur_queue_handle_legacy_t_::enqueueMemBufferCopyRect( ///< [in] handle of the - ///< queue object +ur_result_t urEnqueueMemBufferCopyRect( + ur_queue_handle_t Queue, ///< [in] handle of the queue object ur_mem_handle_t BufferSrc, ///< [in] handle of the source buffer object ur_mem_handle_t BufferDst, ///< [in] handle of the dest buffer object ur_rect_offset_t SrcOrigin, ///< [in] 3D offset in the source buffer @@ -704,7 +702,6 @@ ur_queue_handle_legacy_t_::enqueueMemBufferCopyRect( ///< [in] handle of the *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. 
) { - auto Queue = this; _ur_buffer *SrcBuffer = ur_cast<_ur_buffer *>(BufferSrc); _ur_buffer *DstBuffer = ur_cast<_ur_buffer *>(BufferDst); @@ -722,10 +719,12 @@ ur_queue_handle_legacy_t_::enqueueMemBufferCopyRect( ///< [in] handle of the char *ZeHandleSrc = nullptr; UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, - Queue->Device)); + Queue->Device, EventWaitList, + NumEventsInWaitList)); char *ZeHandleDst = nullptr; UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - Queue->Device)); + Queue->Device, EventWaitList, + NumEventsInWaitList)); return enqueueMemCopyRectHelper( UR_COMMAND_MEM_BUFFER_COPY_RECT, Queue, ZeHandleSrc, ZeHandleDst, @@ -735,11 +734,12 @@ ur_queue_handle_legacy_t_::enqueueMemBufferCopyRect( ///< [in] handle of the NumEventsInWaitList, EventWaitList, OutEvent, PreferCopyEngine); } -ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferFill( - ur_mem_handle_t Buffer, ///< [in] handle of the buffer object - const void *Pattern, ///< [in] pointer to the fill pattern - size_t PatternSize, ///< [in] size in bytes of the pattern - size_t Offset, ///< [in] offset into the buffer +ur_result_t urEnqueueMemBufferFill( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t Buffer, ///< [in] handle of the buffer object + const void *Pattern, ///< [in] pointer to the fill pattern + size_t PatternSize, ///< [in] size in bytes of the pattern + size_t Offset, ///< [in] offset into the buffer size_t Size, ///< [in] fill size in bytes, must be a multiple of patternSize uint32_t NumEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t @@ -753,14 +753,14 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferFill( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. 
) { - auto Queue = this; std::scoped_lock Lock(Queue->Mutex, Buffer->Mutex); char *ZeHandleDst = nullptr; _ur_buffer *UrBuffer = reinterpret_cast<_ur_buffer *>(Buffer); UR_CALL(UrBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - Queue->Device)); + Queue->Device, EventWaitList, + NumEventsInWaitList)); return enqueueMemFillHelper( UR_COMMAND_MEM_BUFFER_FILL, Queue, ZeHandleDst + Offset, Pattern, // It will be interpreted as an 8-bit value, @@ -768,8 +768,9 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferFill( Size, NumEventsInWaitList, EventWaitList, OutEvent); } -ur_result_t ur_queue_handle_legacy_t_::enqueueMemImageRead( - ur_mem_handle_t Image, ///< [in] handle of the image object +ur_result_t urEnqueueMemImageRead( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t Image, ///< [in] handle of the image object bool BlockingRead, ///< [in] indicates blocking (true), non-blocking (false) ur_rect_offset_t Origin, ///< [in] defines the (x,y,z) offset in pixels in ///< the 1D, 2D, or 3D image @@ -790,7 +791,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemImageRead( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. 
) { - auto Queue = this; std::scoped_lock Lock(Queue->Mutex, Image->Mutex); return enqueueMemImageCommandHelper( @@ -799,8 +799,9 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemImageRead( EventWaitList, OutEvent); } -ur_result_t ur_queue_handle_legacy_t_::enqueueMemImageWrite( - ur_mem_handle_t Image, ///< [in] handle of the image object +ur_result_t urEnqueueMemImageWrite( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t Image, ///< [in] handle of the image object bool BlockingWrite, ///< [in] indicates blocking (true), non-blocking (false) ur_rect_offset_t Origin, ///< [in] defines the (x,y,z) offset in pixels in @@ -822,7 +823,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemImageWrite( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. ) { - auto Queue = this; std::scoped_lock Lock(Queue->Mutex, Image->Mutex); return enqueueMemImageCommandHelper( @@ -831,9 +831,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemImageWrite( EventWaitList, OutEvent); } -ur_result_t -ur_queue_handle_legacy_t_::enqueueMemImageCopy( ///< [in] handle of - ///< the queue object +ur_result_t urEnqueueMemImageCopy( + ur_queue_handle_t Queue, ///< [in] handle of the queue object ur_mem_handle_t ImageSrc, ///< [in] handle of the src image object ur_mem_handle_t ImageDst, ///< [in] handle of the dest image object ur_rect_offset_t SrcOrigin, ///< [in] defines the (x,y,z) offset in pixels @@ -854,7 +853,6 @@ ur_queue_handle_legacy_t_::enqueueMemImageCopy( ///< [in] handle of *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. 
) { - auto Queue = this; std::shared_lock SrcLock(ImageSrc->Mutex, std::defer_lock); std::scoped_lock, ur_shared_mutex, ur_shared_mutex> @@ -872,8 +870,9 @@ ur_queue_handle_legacy_t_::enqueueMemImageCopy( ///< [in] handle of NumEventsInWaitList, EventWaitList, OutEvent, PreferCopyEngine); } -ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferMap( - ur_mem_handle_t Buf, ///< [in] handle of the buffer object +ur_result_t urEnqueueMemBufferMap( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t Buf, ///< [in] handle of the buffer object bool BlockingMap, ///< [in] indicates blocking (true), non-blocking (false) ur_map_flags_t MapFlags, ///< [in] flags for read, write, readwrite mapping size_t Offset, ///< [in] offset in bytes of the buffer region being mapped @@ -892,7 +891,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferMap( void **RetMap ///< [in,out] return mapped pointer. TODO: move it before ///< numEventsInWaitList? ) { - auto Queue = this; auto Buffer = ur_cast<_ur_buffer *>(Buf); UR_ASSERT(!Buffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); @@ -951,16 +949,17 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferMap( if (Buffer->OnHost) { // Wait on incoming events before doing the copy if (NumEventsInWaitList > 0) - UR_CALL(urEventWait(NumEventsInWaitList, EventWaitList)); + UR_CALL(ur::level_zero::urEventWait(NumEventsInWaitList, EventWaitList)); if (Queue->isInOrderQueue()) - UR_CALL(urQueueFinish(Queue)); + UR_CALL(ur::level_zero::urQueueFinish(Queue)); // Lock automatically releases when this goes out of scope. 
std::scoped_lock Guard(Buffer->Mutex); char *ZeHandleSrc; - UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device)); + UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device, + EventWaitList, NumEventsInWaitList)); if (Buffer->MapHostPtr) { *RetMap = Buffer->MapHostPtr + Offset; @@ -1017,7 +1016,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferMap( const auto &WaitList = (*Event)->WaitList; char *ZeHandleSrc; - UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device)); + UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device, + EventWaitList, NumEventsInWaitList)); UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, NumEventsInWaitList, EventWaitList, @@ -1040,7 +1040,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemBufferMap( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::enqueueMemUnmap( +ur_result_t urEnqueueMemUnmap( + ur_queue_handle_t Queue, ///< [in] handle of the queue object ur_mem_handle_t Mem, ///< [in] handle of the memory (buffer or image) object void *MappedPtr, ///< [in] mapped host address uint32_t NumEventsInWaitList, ///< [in] size of the event wait list @@ -1055,7 +1056,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemUnmap( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. 
) { - auto Queue = this; UR_ASSERT(!Mem->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); auto Buffer = ur_cast<_ur_buffer *>(Mem); @@ -1107,14 +1107,15 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemUnmap( if (Buffer->OnHost) { // Wait on incoming events before doing the copy if (NumEventsInWaitList > 0) - UR_CALL(urEventWait(NumEventsInWaitList, EventWaitList)); + UR_CALL(ur::level_zero::urEventWait(NumEventsInWaitList, EventWaitList)); if (Queue->isInOrderQueue()) - UR_CALL(urQueueFinish(Queue)); + UR_CALL(ur::level_zero::urQueueFinish(Queue)); char *ZeHandleDst; UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - Queue->Device)); + Queue->Device, EventWaitList, + NumEventsInWaitList)); std::scoped_lock Guard(Buffer->Mutex); if (Buffer->MapHostPtr) @@ -1133,8 +1134,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemUnmap( ur_command_list_ptr_t CommandList{}; UR_CALL(Queue->Context->getAvailableCommandList( - reinterpret_cast(Queue), CommandList, - UseCopyEngine, NumEventsInWaitList, EventWaitList)); + reinterpret_cast(Queue), CommandList, UseCopyEngine, + NumEventsInWaitList, EventWaitList)); CommandList->second.append(reinterpret_cast(*Event)); (*Event)->RefCount.increment(); @@ -1149,7 +1150,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemUnmap( char *ZeHandleDst; UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - Queue->Device)); + Queue->Device, EventWaitList, + NumEventsInWaitList)); UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, NumEventsInWaitList, EventWaitList, @@ -1167,8 +1169,9 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueMemUnmap( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::enqueueUSMMemcpy( - bool Blocking, ///< [in] blocking or non-blocking copy +ur_result_t urEnqueueUSMMemcpy( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + bool Blocking, ///< [in] blocking or non-blocking copy void *Dst, ///< [in] pointer to the destination USM 
memory object const void *Src, ///< [in] pointer to the source USM memory object size_t Size, ///< [in] size in bytes to be copied @@ -1184,13 +1187,18 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueUSMMemcpy( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. ) { - auto Queue = this; std::scoped_lock lock(Queue->Mutex); // Device to Device copies are found to execute slower on copy engine // (versus compute engine). bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) || !IsDevicePointer(Queue->Context, Dst); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. + if (Queue->Device->isDG2() && (IsSharedPointer(Queue->Context, Src) || + IsSharedPointer(Queue->Context, Dst))) { + PreferCopyEngine = false; + } // Temporary option added to use copy engine for D2D copy PreferCopyEngine |= UseCopyEngineForD2DCopy; @@ -1200,7 +1208,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueUSMMemcpy( NumEventsInWaitList, EventWaitList, OutEvent, PreferCopyEngine); } -ur_result_t ur_queue_handle_legacy_t_::enqueueUSMPrefetch( +ur_result_t urEnqueueUSMPrefetch( + ur_queue_handle_t Queue, ///< [in] handle of the queue object const void *Mem, ///< [in] pointer to the USM memory object size_t Size, ///< [in] size in bytes to be fetched ur_usm_migration_flags_t Flags, ///< [in] USM prefetch flags @@ -1216,7 +1225,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueUSMPrefetch( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. ) { - auto Queue = this; std::ignore = Flags; // Lock automatically releases when this goes out of scope. 
std::scoped_lock lock(Queue->Mutex); @@ -1268,7 +1276,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueUSMPrefetch( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::enqueueUSMAdvise( +ur_result_t urEnqueueUSMAdvise( + ur_queue_handle_t Queue, ///< [in] handle of the queue object const void *Mem, ///< [in] pointer to the USM memory object size_t Size, ///< [in] size in bytes to be advised ur_usm_advice_flags_t Advice, ///< [in] USM memory advice @@ -1276,7 +1285,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueUSMAdvise( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. ) { - auto Queue = this; // Lock automatically releases when this goes out of scope. std::scoped_lock lock(Queue->Mutex); @@ -1326,8 +1334,9 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueUSMAdvise( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::enqueueUSMFill2D( - void *Mem, ///< [in] pointer to memory to be filled. +ur_result_t urEnqueueUSMFill2D( + ur_queue_handle_t Queue, ///< [in] handle of the queue to submit to. + void *Mem, ///< [in] pointer to memory to be filled. size_t Pitch, ///< [in] the total width of the destination memory including ///< padding. size_t PatternSize, ///< [in] the size in bytes of the pattern. @@ -1345,6 +1354,7 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueUSMFill2D( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular kernel execution instance. ) { + std::ignore = Queue; std::ignore = Mem; std::ignore = Pitch; std::ignore = PatternSize; @@ -1359,7 +1369,8 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueUSMFill2D( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t ur_queue_handle_legacy_t_::enqueueUSMMemcpy2D( +ur_result_t urEnqueueUSMMemcpy2D( + ur_queue_handle_t Queue, ///< [in] handle of the queue to submit to. bool Blocking, ///< [in] indicates if this operation should block the host. 
void *Dst, ///< [in] pointer to memory where data will be copied. size_t DstPitch, ///< [in] the total width of the source memory including @@ -1380,7 +1391,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueUSMMemcpy2D( *Event ///< [in,out][optional] return an event object that identifies ///< this particular kernel execution instance. ) { - auto Queue = this; ur_rect_offset_t ZeroOffset{0, 0, 0}; ur_rect_region_t Region{Width, Height, 0}; @@ -1390,6 +1400,12 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueUSMMemcpy2D( // (versus compute engine). bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) || !IsDevicePointer(Queue->Context, Dst); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. + if (Queue->Device->isDG2() && (IsSharedPointer(Queue->Context, Src) || + IsSharedPointer(Queue->Context, Dst))) { + PreferCopyEngine = false; + } // Temporary option added to use copy engine for D2D copy PreferCopyEngine |= UseCopyEngineForD2DCopy; @@ -1409,6 +1425,10 @@ static ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat, auto [ZeImageFormatType, ZeImageFormatTypeSize] = getImageFormatTypeAndSize(ImageFormat); + if (ZeImageFormatTypeSize == 0) { + return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT; + } + // TODO: populate the layout mapping ze_image_format_layout_t ZeImageFormatLayout; switch (ImageFormat->channelOrder) { @@ -1475,7 +1495,7 @@ static ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat, return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( +ur_result_t urMemImageCreate( ur_context_handle_t Context, ///< [in] handle of the context object ur_mem_flags_t Flags, ///< [in] allocation and usage information flags const ur_image_format_t @@ -1524,7 +1544,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( +ur_result_t urMemImageCreateWithNativeHandle( 
ur_native_handle_t NativeMem, ///< [in] the native handle to the memory. ur_context_handle_t Context, ///< [in] handle of the context object. [[maybe_unused]] const ur_image_format_t @@ -1552,7 +1572,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( +ur_result_t urMemBufferCreate( ur_context_handle_t Context, ///< [in] handle of the context object ur_mem_flags_t Flags, ///< [in] allocation and usage information flags size_t Size, ///< [in] size in bytes of the memory object to be allocated @@ -1574,30 +1594,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( Host = Properties->pHost; } - // If USM Import feature is enabled and hostptr is supplied, - // import the hostptr if not already imported into USM. - // Data transfer rate is maximized when both source and destination - // are USM pointers. Promotion of the host pointer to USM thus - // optimizes data transfer performance. 
bool HostPtrImported = false; - if (ZeUSMImport.Enabled && Host != nullptr && - (Flags & UR_MEM_FLAG_USE_HOST_POINTER) != 0) { - // Query memory type of the host pointer - ze_device_handle_t ZeDeviceHandle; - ZeStruct ZeMemoryAllocationProperties; - ZE2UR_CALL(zeMemGetAllocProperties, - (Context->ZeContext, Host, &ZeMemoryAllocationProperties, - &ZeDeviceHandle)); - - // If not shared of any type, we can import the ptr - if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) { - // Promote the host ptr to USM host memory - ze_driver_handle_t driverHandle = - Context->getPlatform()->ZeDriverHandleExpTranslated; - ZeUSMImport.doZeUSMImport(driverHandle, Host, Size); - HostPtrImported = true; - } - } + if (Flags & UR_MEM_FLAG_USE_HOST_POINTER) + HostPtrImported = + maybeImportUSM(Context->getPlatform()->ZeDriverHandleExpTranslated, + Context->ZeContext, Host, Size); _ur_buffer *Buffer = nullptr; auto HostPtrOrNull = (Flags & UR_MEM_FLAG_USE_HOST_POINTER) @@ -1621,7 +1622,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( // allocation. char *ZeHandleDst; UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - Context->Devices[0])); + Context->Devices[0], nullptr, 0u)); if (Buffer->OnHost) { // Do a host to host copy. // For an imported HostPtr the copy is unneeded. 
@@ -1646,14 +1647,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urMemRetain( +ur_result_t urMemRetain( ur_mem_handle_t Mem ///< [in] handle of the memory object to get access ) { Mem->RefCount.increment(); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urMemRelease( +ur_result_t urMemRelease( ur_mem_handle_t Mem ///< [in] handle of the memory object to release ) { if (!Mem->RefCount.decrementAndTest()) @@ -1663,7 +1664,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease( char *ZeHandleImage; auto Image = static_cast<_ur_image *>(Mem); if (Image->OwnNativeHandle) { - UR_CALL(Mem->getZeHandle(ZeHandleImage, ur_mem_handle_t_::write_only)); + UR_CALL(Mem->getZeHandle(ZeHandleImage, ur_mem_handle_t_::write_only, + nullptr, nullptr, 0u)); auto ZeResult = ZE_CALL_NOCHECK( zeImageDestroy, (ur_cast(ZeHandleImage))); // Gracefully handle the case that L0 was already unloaded. @@ -1679,7 +1681,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( +ur_result_t urMemBufferPartition( ur_mem_handle_t Buffer, ///< [in] handle of the buffer object to allocate from ur_mem_flags_t Flags, ///< [in] allocation and usage information flags @@ -1715,7 +1717,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle( +ur_result_t urMemGetNativeHandle( ur_mem_handle_t Mem, ///< [in] handle of the mem. ur_device_handle_t, ///< [in] handle of the device. 
ur_native_handle_t @@ -1723,13 +1725,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle( ) { std::shared_lock Guard(Mem->Mutex); char *ZeHandle = nullptr; - UR_CALL(Mem->getZeHandle(ZeHandle, ur_mem_handle_t_::read_write)); + UR_CALL(Mem->getZeHandle(ZeHandle, ur_mem_handle_t_::read_write, nullptr, + nullptr, 0u)); *NativeMem = ur_cast(ZeHandle); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( +ur_result_t urMemBufferCreateWithNativeHandle( ur_native_handle_t NativeMem, ///< [in] the native handle to the memory. ur_context_handle_t Context, ///< [in] handle of the context object. const ur_mem_native_properties_t @@ -1796,7 +1799,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( ContextsLock.lock(); // Retain context to be sure that it is released after all memory // allocations in this context are released. - UR_CALL(urContextRetain(Context)); + UR_CALL(ur::level_zero::urContextRetain(Context)); Context->MemAllocs.emplace(std::piecewise_construct, std::forward_as_tuple(Ptr), @@ -1814,8 +1817,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( // represent the buffer in this context) copy the data to a newly // created device allocation. char *ZeHandleDst; - UR_CALL( - Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, Device)); + UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Device, nullptr, 0u)); // Indicate that this buffer has the device buffer mapped to a native buffer // and track the native pointer such that the memory is synced later at @@ -1832,7 +1835,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo( +ur_result_t urMemGetInfo( ur_mem_handle_t Memory, ///< [in] handle to the memory object being queried. ur_mem_info_t MemInfoType, ///< [in] type of the info to retrieve. 
size_t PropSize, ///< [in] the number of bytes of memory pointed to by @@ -1868,7 +1871,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo( +ur_result_t urMemImageGetInfo( ur_mem_handle_t Memory, ///< [in] handle to the image object being queried. ur_image_info_t ImgInfoType, ///< [in] type of image info to retrieve. size_t PropSize, ///< [in] the number of bytes of memory pointer to by @@ -1891,6 +1894,79 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +ur_result_t urEnqueueUSMFill( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + void *Ptr, ///< [in] pointer to USM memory object + size_t PatternSize, ///< [in] the size in bytes of the pattern. Must be a + ///< power of 2 and less than or equal to width. + const void *Pattern, ///< [in] pointer with the bytes of the pattern to set. + size_t Size, ///< [in] size in bytes to be set. Must be a multiple of + ///< patternSize. + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, the + ///< numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t *Event ///< [out][optional] return an event object that + ///< identifies this particular command instance. +) { + std::scoped_lock Lock(Queue->Mutex); + + return enqueueMemFillHelper( + // TODO: do we need a new command type for USM memset? 
+ UR_COMMAND_MEM_BUFFER_FILL, Queue, Ptr, + Pattern, // It will be interpreted as an 8-bit value, + PatternSize, // which is indicated with this pattern_size==1 + Size, NumEventsInWaitList, EventWaitList, Event); +} + +/// Host Pipes +ur_result_t urEnqueueReadHostPipe(ur_queue_handle_t hQueue, + ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pDst, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + std::ignore = hQueue; + std::ignore = hProgram; + std::ignore = pipe_symbol; + std::ignore = blocking; + std::ignore = pDst; + std::ignore = size; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + logger::error(logger::LegacyMessage("[UR][L0] {} function not implemented!"), + "{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urEnqueueWriteHostPipe(ur_queue_handle_t hQueue, + ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pSrc, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + std::ignore = hQueue; + std::ignore = hProgram; + std::ignore = pipe_symbol; + std::ignore = blocking; + std::ignore = pSrc; + std::ignore = size; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + logger::error(logger::LegacyMessage("[UR][L0] {} function not implemented!"), + "{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +} // namespace ur::level_zero + // If indirect access tracking is enabled then performs reference counting, // otherwise just calls zeMemAllocDevice. 
static ur_result_t ZeDeviceMemAllocHelper(void **ResultPtr, @@ -1910,7 +1986,7 @@ static ur_result_t ZeDeviceMemAllocHelper(void **ResultPtr, // indirect access, that is why explicitly retain context to be sure // that it is released after all memory allocations in this context are // released. - UR_CALL(urContextRetain(Context)); + UR_CALL(ur::level_zero::urContextRetain(Context)); } ze_device_mem_alloc_desc_t ZeDesc = {}; @@ -1929,7 +2005,9 @@ static ur_result_t ZeDeviceMemAllocHelper(void **ResultPtr, } ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, - ur_device_handle_t Device) { + ur_device_handle_t Device, + const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents) { // NOTE: There might be no valid allocation at all yet and we get // here from piEnqueueKernelLaunch that would be doing the buffer @@ -1944,18 +2022,30 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, auto &Allocation = Allocations[Device]; + if (this->isFreed) { + die("getZeHandle() buffer already released, no valid handles."); + } + // Sub-buffers don't maintain own allocations but rely on parent buffer. if (SubBuffer) { - UR_CALL(SubBuffer->Parent->getZeHandle(ZeHandle, AccessMode, Device)); - ZeHandle += SubBuffer->Origin; - // Still store the allocation info in the PI sub-buffer for - // getZeHandlePtr to work. At least zeKernelSetArgumentValue needs to - // be given a pointer to the allocation handle rather than its value. - // - Allocation.ZeHandle = ZeHandle; - Allocation.ReleaseAction = allocation_t::keep; - LastDeviceWithValidAllocation = Device; - return UR_RESULT_SUCCESS; + // Verify that the Parent Buffer is still valid or if it has been freed. 
+ if (SubBuffer->Parent && !SubBuffer->Parent->isFreed) { + UR_CALL(SubBuffer->Parent->getZeHandle(ZeHandle, AccessMode, Device, + phWaitEvents, numWaitEvents)); + ZeHandle += SubBuffer->Origin; + // Still store the allocation info in the PI sub-buffer for + // getZeHandlePtr to work. At least zeKernelSetArgumentValue needs to + // be given a pointer to the allocation handle rather than its value. + // + Allocation.ZeHandle = ZeHandle; + Allocation.ReleaseAction = allocation_t::keep; + LastDeviceWithValidAllocation = Device; + return UR_RESULT_SUCCESS; + } else { + // Return an error if the parent buffer is already gone. + die("getZeHandle() SubBuffer's parent already released, no valid " + "handles."); + } } // First handle case where the buffer is represented by only @@ -1970,8 +2060,9 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, ur_usm_desc_t USMDesc{}; USMDesc.align = getAlignment(); ur_usm_pool_handle_t Pool{}; - UR_CALL(urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, - reinterpret_cast(&ZeHandle))); + UR_CALL(ur::level_zero::urUSMHostAlloc( + UrContext, &USMDesc, Pool, Size, + reinterpret_cast(&ZeHandle))); } else { HostAllocation.ReleaseAction = allocation_t::free_native; UR_CALL(ZeHostMemAllocHelper(reinterpret_cast(&ZeHandle), @@ -2018,7 +2109,8 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, // TODO: we can probably generalize this and share root-device // allocations by its own sub-devices even if not all other // devices in the context have the same root. 
- UR_CALL(getZeHandle(ZeHandle, AccessMode, UrContext->SingleRootDevice)); + UR_CALL(getZeHandle(ZeHandle, AccessMode, UrContext->SingleRootDevice, + phWaitEvents, numWaitEvents)); Allocation.ReleaseAction = allocation_t::keep; Allocation.ZeHandle = ZeHandle; Allocation.Valid = true; @@ -2029,8 +2121,9 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, ur_usm_desc_t USMDesc{}; USMDesc.align = getAlignment(); ur_usm_pool_handle_t Pool{}; - UR_CALL(urUSMDeviceAlloc(UrContext, Device, &USMDesc, Pool, Size, - reinterpret_cast(&ZeHandle))); + UR_CALL(ur::level_zero::urUSMDeviceAlloc( + UrContext, Device, &USMDesc, Pool, Size, + reinterpret_cast(&ZeHandle))); } else { Allocation.ReleaseAction = allocation_t::free_native; UR_CALL(ZeDeviceMemAllocHelper(reinterpret_cast(&ZeHandle), @@ -2060,7 +2153,8 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, char *ZeHandleSrc = nullptr; if (NeedCopy) { UR_CALL(getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, - LastDeviceWithValidAllocation)); + LastDeviceWithValidAllocation, phWaitEvents, + numWaitEvents)); // It's possible with the single root-device contexts that // the buffer is represented by the single root-device // allocation and then skip the copy to itself. @@ -2069,6 +2163,33 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, } if (NeedCopy) { + // Wait on all dependency events passed in to ensure that the memory which + // is being init is updated correctly. + _ur_ze_event_list_t waitlist; + waitlist.ZeEventList = nullptr; + waitlist.Length = 0; + uint32_t EventListIndex = 0; + for (unsigned i = 0; i < numWaitEvents; ++i) { + if (phWaitEvents[i]->HostVisibleEvent) { + ZE2UR_CALL(zeEventHostSynchronize, + (phWaitEvents[i]->ZeEvent, UINT64_MAX)); + } else { + // Generate the waitlist for the Copy calls based on the passed in + // dependencies, if they exist for device only. 
+ if (waitlist.ZeEventList == nullptr) { + waitlist.ZeEventList = new ze_event_handle_t[numWaitEvents]; + } + waitlist.ZeEventList[EventListIndex] = phWaitEvents[i]->ZeEvent; + waitlist.Length++; + EventListIndex++; + } + } + if (waitlist.Length > 0) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (UrContext->ZeCommandListInit, waitlist.Length, + waitlist.ZeEventList)); + } + // Copy valid buffer data to this allocation. // TODO: see if we should better use peer's device allocation used // directly, if that capability is reported with zeDeviceCanAccessPeer, @@ -2093,8 +2214,8 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, ur_usm_desc_t USMDesc{}; USMDesc.align = getAlignment(); ur_usm_pool_handle_t Pool{}; - UR_CALL( - urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, &ZeHandleHost)); + UR_CALL(ur::level_zero::urUSMHostAlloc(UrContext, &USMDesc, Pool, + Size, &ZeHandleHost)); } else { HostAllocation.ReleaseAction = allocation_t::free_native; UR_CALL(ZeHostMemAllocHelper(&ZeHandleHost, UrContext, Size)); @@ -2106,7 +2227,7 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, if (!HostAllocation.Valid) { ZE2UR_CALL(zeCommandListAppendMemoryCopy, (UrContext->ZeCommandListInit, HostAllocation.ZeHandle, - ZeHandleSrc, Size, nullptr, 0, nullptr)); + ZeHandleSrc, Size, nullptr, 0u, nullptr)); // Mark the host allocation data as valid so it can be reused. // It will be invalidated below if the current access is not // read-only. @@ -2114,13 +2235,16 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, } ZE2UR_CALL(zeCommandListAppendMemoryCopy, (UrContext->ZeCommandListInit, ZeHandle, - HostAllocation.ZeHandle, Size, nullptr, 0, nullptr)); + HostAllocation.ZeHandle, Size, nullptr, 0u, nullptr)); } else { // Perform P2P copy. 
std::scoped_lock Lock(UrContext->ImmediateCommandListMutex); ZE2UR_CALL(zeCommandListAppendMemoryCopy, (UrContext->ZeCommandListInit, ZeHandle, ZeHandleSrc, Size, - nullptr, 0, nullptr)); + nullptr, 0u, nullptr)); + } + if (waitlist.ZeEventList) { + delete waitlist.ZeEventList; } } Allocation.Valid = true; @@ -2184,6 +2308,7 @@ ur_result_t _ur_buffer::free() { die("_ur_buffer::free(): Unhandled release action"); } ZeHandle = nullptr; // don't leave hanging pointers + this->isFreed = true; } return UR_RESULT_SUCCESS; } @@ -2249,9 +2374,12 @@ _ur_buffer::_ur_buffer(ur_context_handle_t Context, size_t Size, ur_result_t _ur_buffer::getZeHandlePtr(char **&ZeHandlePtr, access_mode_t AccessMode, - ur_device_handle_t Device) { + ur_device_handle_t Device, + const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents) { char *ZeHandle; - UR_CALL(getZeHandle(ZeHandle, AccessMode, Device)); + UR_CALL( + getZeHandle(ZeHandle, AccessMode, Device, phWaitEvents, numWaitEvents)); ZeHandlePtr = &Allocations[Device].ZeHandle; return UR_RESULT_SUCCESS; } @@ -2276,66 +2404,3 @@ size_t _ur_buffer::getAlignment() const { Alignment = 1UL; return Alignment; } - -ur_result_t ur_queue_handle_legacy_t_::enqueueUSMFill( - void *Ptr, ///< [in] pointer to USM memory object - size_t PatternSize, ///< [in] the size in bytes of the pattern. Must be a - ///< power of 2 and less than or equal to width. - const void *Pattern, ///< [in] pointer with the bytes of the pattern to set. - size_t Size, ///< [in] size in bytes to be set. Must be a multiple of - ///< patternSize. - uint32_t NumEventsInWaitList, ///< [in] size of the event wait list - const ur_event_handle_t * - EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] - ///< pointer to a list of events that must be complete - ///< before this command can be executed. If nullptr, the - ///< numEventsInWaitList must be 0, indicating that this - ///< command does not wait on any event to complete. 
- ur_event_handle_t *Event ///< [out][optional] return an event object that - ///< identifies this particular command instance. -) { - auto Queue = this; - std::scoped_lock Lock(Queue->Mutex); - - return enqueueMemFillHelper( - // TODO: do we need a new command type for USM memset? - UR_COMMAND_MEM_BUFFER_FILL, Queue, Ptr, - Pattern, // It will be interpreted as an 8-bit value, - PatternSize, // which is indicated with this pattern_size==1 - Size, NumEventsInWaitList, EventWaitList, Event); -} - -/// Host Pipes -ur_result_t ur_queue_handle_legacy_t_::enqueueReadHostPipe( - ur_program_handle_t hProgram, const char *pipe_symbol, bool blocking, - void *pDst, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = hProgram; - std::ignore = pipe_symbol; - std::ignore = blocking; - std::ignore = pDst; - std::ignore = size; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - logger::error(logger::LegacyMessage("[UR][L0] {} function not implemented!"), - "{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t ur_queue_handle_legacy_t_::enqueueWriteHostPipe( - ur_program_handle_t hProgram, const char *pipe_symbol, bool blocking, - void *pSrc, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = hProgram; - std::ignore = pipe_symbol; - std::ignore = blocking; - std::ignore = pSrc; - std::ignore = size; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - logger::error(logger::LegacyMessage("[UR][L0] {} function not implemented!"), - "{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} diff --git a/source/adapters/level_zero/memory.hpp b/source/adapters/level_zero/memory.hpp index b590165947..b8e683e16e 100644 --- 
a/source/adapters/level_zero/memory.hpp +++ b/source/adapters/level_zero/memory.hpp @@ -20,18 +20,16 @@ #include #include -#include +#include #include #include #include "ur_level_zero.hpp" -struct ur_queue_handle_legacy_t_; -using ur_queue_handle_legacy_t = ur_queue_handle_legacy_t_ *; - struct ur_device_handle_t_; bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr); +bool IsSharedPointer(ur_context_handle_t Context, const void *Ptr); // This is an experimental option to test performance of device to device copy // operations on copy engines (versus compute engine) @@ -47,7 +45,7 @@ const bool UseCopyEngineForD2DCopy = [] { // PI interfaces must have queue's and destination buffer's mutexes locked for // exclusive use and source buffer's mutex locked for shared use on entry. ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, - ur_queue_handle_legacy_t Queue, void *Dst, + ur_queue_handle_t Queue, void *Dst, ur_bool_t BlockingWrite, size_t Size, const void *Src, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, @@ -55,13 +53,12 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, bool PreferCopyEngine); ur_result_t enqueueMemCopyRectHelper( - ur_command_t CommandType, ur_queue_handle_legacy_t Queue, - const void *SrcBuffer, void *DstBuffer, ur_rect_offset_t SrcOrigin, - ur_rect_offset_t DstOrigin, ur_rect_region_t Region, size_t SrcRowPitch, - size_t DstRowPitch, size_t SrcSlicePitch, size_t DstSlicePitch, - ur_bool_t Blocking, uint32_t NumEventsInWaitList, - const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, - bool PreferCopyEngine = false); + ur_command_t CommandType, ur_queue_handle_t Queue, const void *SrcBuffer, + void *DstBuffer, ur_rect_offset_t SrcOrigin, ur_rect_offset_t DstOrigin, + ur_rect_region_t Region, size_t SrcRowPitch, size_t DstRowPitch, + size_t SrcSlicePitch, size_t DstSlicePitch, ur_bool_t Blocking, + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + 
ur_event_handle_t *OutEvent, bool PreferCopyEngine = false); struct ur_mem_handle_t_ : _ur_object { // Keeps the PI context of this memory handle. @@ -77,11 +74,15 @@ struct ur_mem_handle_t_ : _ur_object { // Get the Level Zero handle of the current memory object virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t, - ur_device_handle_t Device = nullptr) = 0; + ur_device_handle_t Device, + const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents) = 0; // Get a pointer to the Level Zero handle of the current memory object virtual ur_result_t getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, - ur_device_handle_t Device = nullptr) = 0; + ur_device_handle_t Device, + const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents) = 0; // Method to get type of the derived object (image or buffer) virtual bool isImage() const = 0; @@ -107,7 +108,10 @@ struct _ur_buffer final : ur_mem_handle_t_ { // Sub-buffer constructor _ur_buffer(_ur_buffer *Parent, size_t Origin, size_t Size) : ur_mem_handle_t_(Parent->UrContext), - Size(Size), SubBuffer{{Parent, Origin}} {} + Size(Size), SubBuffer{{Parent, Origin}} { + // Retain the Parent Buffer due to the Creation of the SubBuffer. + Parent->RefCount.increment(); + } // Interop-buffer constructor _ur_buffer(ur_context_handle_t Context, size_t Size, @@ -121,10 +125,13 @@ struct _ur_buffer final : ur_mem_handle_t_ { // the hood. 
// virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t, - ur_device_handle_t Device = nullptr) override; - virtual ur_result_t - getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, - ur_device_handle_t Device = nullptr) override; + ur_device_handle_t Device, + const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents) override; + virtual ur_result_t getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, + ur_device_handle_t Device, + const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents) override; bool isImage() const override { return false; } bool isSubBuffer() const { return SubBuffer != std::nullopt; } @@ -132,6 +139,9 @@ struct _ur_buffer final : ur_mem_handle_t_ { // Frees all allocations made for the buffer. ur_result_t free(); + // Tracks if this buffer is freed already or should be considered valid. + bool isFreed{false}; + // Information about a single allocation representing this buffer. struct allocation_t { // Level Zero memory handle is really just a naked pointer. 
@@ -202,12 +212,20 @@ struct _ur_image final : ur_mem_handle_t_ { } virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t, - ur_device_handle_t = nullptr) override { + ur_device_handle_t, + const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents) override { + std::ignore = phWaitEvents; + std::ignore = numWaitEvents; ZeHandle = reinterpret_cast(ZeImage); return UR_RESULT_SUCCESS; } virtual ur_result_t getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, - ur_device_handle_t = nullptr) override { + ur_device_handle_t, + const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents) override { + std::ignore = phWaitEvents; + std::ignore = numWaitEvents; ZeHandlePtr = reinterpret_cast(&ZeImage); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/physical_mem.cpp b/source/adapters/level_zero/physical_mem.cpp index d4d9792f24..e7bb498859 100644 --- a/source/adapters/level_zero/physical_mem.cpp +++ b/source/adapters/level_zero/physical_mem.cpp @@ -14,7 +14,9 @@ #include "device.hpp" #include "ur_level_zero.hpp" -UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemCreate( +namespace ur::level_zero { + +ur_result_t urPhysicalMemCreate( ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, [[maybe_unused]] const ur_physical_mem_properties_t *pProperties, ur_physical_mem_handle_t *phPhysicalMem) { @@ -35,14 +37,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemCreate( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL -urPhysicalMemRetain(ur_physical_mem_handle_t hPhysicalMem) { +ur_result_t urPhysicalMemRetain(ur_physical_mem_handle_t hPhysicalMem) { hPhysicalMem->RefCount.increment(); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL -urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) { +ur_result_t urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) { if (!hPhysicalMem->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; @@ -52,3 +52,4 @@ 
urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) { return UR_RESULT_SUCCESS; } +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/platform.cpp b/source/adapters/level_zero/platform.cpp index 02b3663710..721db3c359 100644 --- a/source/adapters/level_zero/platform.cpp +++ b/source/adapters/level_zero/platform.cpp @@ -12,7 +12,9 @@ #include "adapter.hpp" #include "ur_level_zero.hpp" -UR_APIEXPORT ur_result_t UR_APICALL urPlatformGet( +namespace ur::level_zero { + +ur_result_t urPlatformGet( ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, ///< [in] the number of platforms to be added to ///< phPlatforms. If phPlatforms is not NULL, then @@ -47,7 +49,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGet( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo( +ur_result_t urPlatformGetInfo( ur_platform_handle_t Platform, ///< [in] handle of the platform ur_platform_info_t ParamName, ///< [in] type of the info to retrieve size_t Size, ///< [in] the number of bytes pointed to by pPlatformInfo. @@ -101,7 +103,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( +ur_result_t urPlatformGetApiVersion( ur_platform_handle_t Driver, ///< [in] handle of the platform ur_api_version_t *Version ///< [out] api version ) { @@ -110,7 +112,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle( +ur_result_t urPlatformGetNativeHandle( ur_platform_handle_t Platform, ///< [in] handle of the platform. ur_native_handle_t *NativePlatform ///< [out] a pointer to the native ///< handle of the platform. 
@@ -120,7 +122,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( +ur_result_t urPlatformCreateWithNativeHandle( ur_native_handle_t NativePlatform, ///< [in] the native handle of the platform. ur_adapter_handle_t, @@ -135,12 +137,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( uint32_t NumPlatforms = 0; ur_adapter_handle_t AdapterHandle = GlobalAdapter; - UR_CALL(urPlatformGet(&AdapterHandle, 1, 0, nullptr, &NumPlatforms)); + UR_CALL(ur::level_zero::urPlatformGet(&AdapterHandle, 1, 0, nullptr, + &NumPlatforms)); if (NumPlatforms) { std::vector Platforms(NumPlatforms); - UR_CALL(urPlatformGet(&AdapterHandle, 1, NumPlatforms, Platforms.data(), - nullptr)); + UR_CALL(ur::level_zero::urPlatformGet(&AdapterHandle, 1, NumPlatforms, + Platforms.data(), nullptr)); // The SYCL spec requires that the set of platforms must remain fixed for // the duration of the application's execution. We assume that we found all @@ -158,6 +161,46 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( return UR_RESULT_ERROR_INVALID_VALUE; } +// Returns plugin specific backend option. +// Current support is only for optimization options. +// Return '-ze-opt-disable' for frontend_option = -O0. +// Return '-ze-opt-level=2' for frontend_option = -O1, -O2 or -O3. +// Return '-igc_opts 'PartitionUnit=1,SubroutineThreshold=50000'' for +// frontend_option=-ftarget-compile-fast. +ur_result_t urPlatformGetBackendOption( + ur_platform_handle_t Platform, ///< [in] handle of the platform instance. + const char *FrontendOption, ///< [in] string containing the frontend option. + const char * + *PlatformOption ///< [out] returns the correct platform specific + ///< compiler option based on the frontend option. 
+) { + std::ignore = Platform; + using namespace std::literals; + if (FrontendOption == nullptr) { + return UR_RESULT_SUCCESS; + } + if (FrontendOption == ""sv) { + *PlatformOption = ""; + return UR_RESULT_SUCCESS; + } + if (FrontendOption == "-O0"sv) { + *PlatformOption = "-ze-opt-disable"; + return UR_RESULT_SUCCESS; + } + if (FrontendOption == "-O1"sv || FrontendOption == "-O2"sv || + FrontendOption == "-O3"sv) { + *PlatformOption = "-ze-opt-level=2"; + return UR_RESULT_SUCCESS; + } + if (FrontendOption == "-ftarget-compile-fast"sv) { + *PlatformOption = "-igc_opts 'PartitionUnit=1,SubroutineThreshold=50000'"; + return UR_RESULT_SUCCESS; + } + return UR_RESULT_ERROR_INVALID_VALUE; +} + +} // namespace ur::level_zero + ur_result_t ur_platform_handle_t_::initialize() { ZE2UR_CALL(zeDriverGetApiVersion, (ZeDriver, &ZeApiVersion)); ZeDriverApiVersion = std::to_string(ZE_MAJOR_VERSION(ZeApiVersion)) + "." + @@ -266,6 +309,67 @@ ur_result_t ur_platform_handle_t_::initialize() { return UR_RESULT_SUCCESS; } +/// Checks the version of the level-zero driver. +/// @param VersionMajor Major version number to compare to. +/// @param VersionMinor Minor version number to compare to. +/// @param VersionBuild Build version number to compare to. 
+/// @return true if the version of the driver is higher than or equal to the +/// compared version +bool ur_platform_handle_t_::isDriverVersionNewerOrSimilar( + uint32_t VersionMajor, uint32_t VersionMinor, uint32_t VersionBuild) { + uint32_t DriverVersionMajor = 0; + uint32_t DriverVersionMinor = 0; + uint32_t DriverVersionBuild = 0; + if (!ZeDriverVersionString.Supported) { + ZeStruct ZeDriverProperties; + ZE2UR_CALL(zeDriverGetProperties, (ZeDriver, &ZeDriverProperties)); + uint32_t DriverVersion = ZeDriverProperties.driverVersion; + DriverVersionMajor = (DriverVersion & 0xFF000000) >> 24; + DriverVersionMinor = (DriverVersion & 0x00FF0000) >> 16; + DriverVersionBuild = DriverVersion & 0x0000FFFF; + } else { + std::string ZeDriverVersion; + size_t sizeOfDriverString = 0; + ZeDriverVersionString.getDriverVersionString(ZeDriverHandleExpTranslated, + nullptr, &sizeOfDriverString); + ZeDriverVersion.resize(sizeOfDriverString); + ZeDriverVersionString.getDriverVersionString(ZeDriverHandleExpTranslated, + ZeDriverVersion.data(), + &sizeOfDriverString); + + // Intel driver version string is in the format: + // Major.Minor.Build+Hotfix where hotfix is optional. + std::stringstream VersionString(ZeDriverVersion); + + std::string VersionValue; + std::vector VersionValues; + char VersionDelim = '.'; + char HotfixDelim = '+'; + + while (getline(VersionString, VersionValue, VersionDelim)) { + VersionValues.push_back(VersionValue); + } + // If the extension exists, but the string value comes by empty or + // malformed, assume this is a developer driver. + if (VersionValues.size() >= 3) { + DriverVersionMajor = atoi(VersionValues[0].c_str()); + DriverVersionMinor = atoi(VersionValues[1].c_str()); + std::stringstream HotfixString(VersionValues[2]); + std::vector BuildHotfixVersionValues; + // Check to see if there is a hotfix value and strip it off. 
+ while (getline(HotfixString, VersionValue, HotfixDelim)) { + BuildHotfixVersionValues.push_back(VersionValue); + } + DriverVersionBuild = atoi(BuildHotfixVersionValues[0].c_str()); + } else { + return true; + } + } + return std::make_tuple(DriverVersionMajor, DriverVersionMinor, + DriverVersionBuild) >= + std::make_tuple(VersionMajor, VersionMinor, VersionBuild); +} + // Get the cached PI device created for the L0 device handle. // Return NULL if no such PI device found. ur_device_handle_t @@ -442,6 +546,8 @@ ur_result_t ur_platform_handle_t_::populateDeviceCacheIfNeeded() { return UR_RESULT_SUCCESS; } +size_t ur_platform_handle_t_::getNumDevices() { return URDevicesCache.size(); } + ur_device_handle_t ur_platform_handle_t_::getDeviceById(DeviceId id) { for (auto &dev : URDevicesCache) { if (dev->Id == id) { @@ -450,41 +556,3 @@ ur_device_handle_t ur_platform_handle_t_::getDeviceById(DeviceId id) { } return nullptr; } - -// Returns plugin specific backend option. -// Current support is only for optimization options. -// Return '-ze-opt-disable' for frontend_option = -O0. -// Return '-ze-opt-level=2' for frontend_option = -O1, -O2 or -O3. -// Return '-igc_opts 'PartitionUnit=1,SubroutineThreshold=50000'' for -// frontend_option=-ftarget-compile-fast. -UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( - ur_platform_handle_t Platform, ///< [in] handle of the platform instance. - const char *FrontendOption, ///< [in] string containing the frontend option. - const char * - *PlatformOption ///< [out] returns the correct platform specific - ///< compiler option based on the frontend option. 
-) { - std::ignore = Platform; - using namespace std::literals; - if (FrontendOption == nullptr) { - return UR_RESULT_SUCCESS; - } - if (FrontendOption == ""sv) { - *PlatformOption = ""; - return UR_RESULT_SUCCESS; - } - if (FrontendOption == "-O0"sv) { - *PlatformOption = "-ze-opt-disable"; - return UR_RESULT_SUCCESS; - } - if (FrontendOption == "-O1"sv || FrontendOption == "-O2"sv || - FrontendOption == "-O3"sv) { - *PlatformOption = "-ze-opt-level=2"; - return UR_RESULT_SUCCESS; - } - if (FrontendOption == "-ftarget-compile-fast"sv) { - *PlatformOption = "-igc_opts 'PartitionUnit=1,SubroutineThreshold=50000'"; - return UR_RESULT_SUCCESS; - } - return UR_RESULT_ERROR_INVALID_VALUE; -} diff --git a/source/adapters/level_zero/platform.hpp b/source/adapters/level_zero/platform.hpp index f9fdcb117e..b53b55bb23 100644 --- a/source/adapters/level_zero/platform.hpp +++ b/source/adapters/level_zero/platform.hpp @@ -12,11 +12,18 @@ #include "common.hpp" #include "ur_api.h" #include "ze_api.h" +#include "zes_api.h" struct ur_device_handle_t_; typedef size_t DeviceId; +struct ur_zes_device_handle_data_t { + zes_device_handle_t ZesDevice; + uint32_t SubDeviceId; + ze_bool_t SubDevice = false; +}; + struct ur_platform_handle_t_ : public _ur_platform { ur_platform_handle_t_(ze_driver_handle_t Driver) : ZeDriver{Driver}, ZeApiVersion{ZE_API_VERSION_CURRENT} {} @@ -27,6 +34,11 @@ struct ur_platform_handle_t_ : public _ur_platform { // a pretty good fit to keep here. ze_driver_handle_t ZeDriver; + // Cache of the ZesDevices mapped to the ZeDevices for use in zes apis calls + // based on a ze device handle. + std::unordered_map + ZedeviceToZesDeviceMap; + // Given a multi driver scenario, the driver handle must be translated to the // internal driver handle to allow calls to driver experimental apis. ze_driver_handle_t ZeDriverHandleExpTranslated; @@ -56,12 +68,19 @@ struct ur_platform_handle_t_ : public _ur_platform { // Check the device cache and load it if necessary. 
ur_result_t populateDeviceCacheIfNeeded(); + size_t getNumDevices(); + ur_device_handle_t getDeviceById(DeviceId); // Return the PI device from cache that represents given native device. // If not found, then nullptr is returned. ur_device_handle_t getDeviceFromNativeHandle(ze_device_handle_t); + /// Checks the version of the level-zero driver. + bool isDriverVersionNewerOrSimilar(uint32_t VersionMajor, + uint32_t VersionMinor, + uint32_t VersionBuild); + // Keep track of all contexts in the platform. This is needed to manage // a lifetime of memory allocations in each context when there are kernels // with indirect access. diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp index 26c75aef31..5f5ec387a0 100644 --- a/source/adapters/level_zero/program.cpp +++ b/source/adapters/level_zero/program.cpp @@ -9,8 +9,15 @@ //===----------------------------------------------------------------------===// #include "program.hpp" +#include "device.hpp" #include "logger/ur_logger.hpp" -#include "ur_level_zero.hpp" +#include "ur_interface_loader.hpp" + +#ifdef UR_ADAPTER_LEVEL_ZERO_V2 +#include "v2/context.hpp" +#else +#include "context.hpp" +#endif extern "C" { // Check to see if a Level Zero module has any unresolved symbols. @@ -48,7 +55,9 @@ checkUnresolvedSymbols(ze_module_handle_t ZeModule, } } // extern "C" -UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL( +namespace ur::level_zero { + +ur_result_t urProgramCreateWithIL( ur_context_handle_t Context, ///< [in] handle of the context instance const void *IL, ///< [in] pointer to IL binary. size_t Length, ///< [in] length of `pIL` in bytes. @@ -58,6 +67,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL( *Program ///< [out] pointer to handle of program object created. 
) { std::ignore = Properties; + UR_ASSERT(Context, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(IL && Program, UR_RESULT_ERROR_INVALID_NULL_POINTER); try { ur_program_handle_t_ *UrProgram = new ur_program_handle_t_(ur_program_handle_t_::IL, Context, IL, Length); @@ -71,7 +82,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( +ur_result_t urProgramCreateWithBinary( ur_context_handle_t Context, ///< [in] handle of the context instance ur_device_handle_t Device, ///< [in] handle to device associated with binary. @@ -82,8 +93,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( ur_program_handle_t *Program ///< [out] pointer to handle of Program object created. ) { - std::ignore = Device; - std::ignore = Properties; // In OpenCL, clCreateProgramWithBinary() can be used to load any of the // following: "program executable", "compiled program", or "library of // compiled programs". In addition, the loaded program can be either @@ -96,8 +105,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( // information to distinguish the cases. try { - ur_program_handle_t_ *UrProgram = new ur_program_handle_t_( - ur_program_handle_t_::Native, Context, Binary, Size); + ur_program_handle_t_ *UrProgram = + new ur_program_handle_t_(ur_program_handle_t_::Native, Context, Device, + Properties, Binary, Size); *Program = reinterpret_cast(UrProgram); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; @@ -108,17 +118,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild( +ur_result_t urProgramBuild( ur_context_handle_t Context, ///< [in] handle of the context instance. ur_program_handle_t Program, ///< [in] Handle of the program to build. const char *Options ///< [in][optional] pointer to build options ///< null-terminated string. 
) { - return urProgramBuildExp(Program, Context->Devices.size(), - Context->Devices.data(), Options); + std::vector Devices = Context->getDevices(); + return ur::level_zero::urProgramBuildExp(Program, Devices.size(), + Devices.data(), Options); } -UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp( +ur_result_t urProgramBuildExp( ur_program_handle_t hProgram, ///< [in] Handle of the program to build. uint32_t numDevices, ///< [in] number of devices ur_device_handle_t *phDevices, ///< [in][range(0, numDevices)] pointer to @@ -173,7 +184,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp( for (uint32_t i = 0; i < numDevices; i++) { ze_device_handle_t ZeDevice = phDevices[i]->ZeDevice; - ze_context_handle_t ZeContext = hProgram->Context->ZeContext; + ze_context_handle_t ZeContext = hProgram->Context->getZeHandle(); ze_module_handle_t ZeModuleHandle = nullptr; ze_module_build_log_handle_t ZeBuildLog{}; @@ -208,12 +219,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp( } } hProgram->ZeModuleMap.insert(std::make_pair(ZeDevice, ZeModuleHandle)); - hProgram->ZeBuildLogMap.insert(std::make_pair(ZeDevice, ZeBuildLog)); } + hProgram->ZeBuildLogMap.insert(std::make_pair(ZeDevice, ZeBuildLog)); } - // We no longer need the IL / native code. - hProgram->Code.reset(); if (!hProgram->ZeModuleMap.empty()) hProgram->ZeModule = hProgram->ZeModuleMap.begin()->second; if (!hProgram->ZeBuildLogMap.empty()) @@ -221,7 +230,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp( return Result; } -UR_APIEXPORT ur_result_t UR_APICALL urProgramCompileExp( +ur_result_t urProgramCompileExp( ur_program_handle_t hProgram, ///< [in][out] handle of the program to compile. 
uint32_t numDevices, ///< [in] number of devices @@ -232,10 +241,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompileExp( ) { std::ignore = numDevices; std::ignore = phDevices; - return urProgramCompile(hProgram->Context, hProgram, pOptions); + return ur::level_zero::urProgramCompile(hProgram->Context, hProgram, + pOptions); } -UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile( +ur_result_t urProgramCompile( ur_context_handle_t Context, ///< [in] handle of the context instance. ur_program_handle_t Program, ///< [in][out] handle of the program to compile. @@ -265,7 +275,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile( // ze-opt-greater-than-4GB-buffer-required to disable // stateful optimizations and be able to use larger than // 4GB allocations on these kernels. - if (Context->Devices[0]->useRelaxedAllocationLimits()) { + if (Context->getDevices()[0]->useRelaxedAllocationLimits()) { Program->BuildFlags += " -ze-opt-greater-than-4GB-buffer-required"; } } @@ -274,7 +284,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urProgramLink( +ur_result_t urProgramLink( ur_context_handle_t Context, ///< [in] handle of the context instance. uint32_t Count, ///< [in] number of program handles in `phPrograms`. const ur_program_handle_t *Programs, ///< [in][range(0, count)] pointer to @@ -284,12 +294,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramLink( ur_program_handle_t *Program ///< [out] pointer to handle of program object created. 
) { - return urProgramLinkExp(Context, Context->Devices.size(), - Context->Devices.data(), Count, Programs, Options, - Program); + std::vector Devices = Context->getDevices(); + return ur::level_zero::urProgramLinkExp(Context, Devices.size(), + Devices.data(), Count, Programs, + Options, Program); } -UR_APIEXPORT ur_result_t UR_APICALL urProgramLinkExp( +ur_result_t urProgramLinkExp( ur_context_handle_t hContext, ///< [in] handle of the context instance. uint32_t numDevices, ///< [in] number of devices ur_device_handle_t *phDevices, ///< [in][range(0, numDevices)] pointer to @@ -425,7 +436,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramLinkExp( // Call the Level Zero API to compile, link, and create the module. ze_device_handle_t ZeDevice = phDevices[i]->ZeDevice; - ze_context_handle_t ZeContext = hContext->ZeContext; + ze_context_handle_t ZeContext = hContext->getZeHandle(); ze_module_handle_t ZeModule = nullptr; ze_module_build_log_handle_t ZeBuildLog = nullptr; ze_result_t ZeResult = @@ -475,14 +486,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramLinkExp( return UrResult; } -UR_APIEXPORT ur_result_t UR_APICALL urProgramRetain( +ur_result_t urProgramRetain( ur_program_handle_t Program ///< [in] handle for the Program to retain ) { Program->RefCount.increment(); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urProgramRelease( +ur_result_t urProgramRelease( ur_program_handle_t Program ///< [in] handle for the Program to release ) { if (!Program->RefCount.decrementAndTest()) @@ -519,7 +530,7 @@ static bool is_in_separated_string(const std::string &str, char delimiter, return false; } -UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( +ur_result_t urProgramGetFunctionPointer( ur_device_handle_t Device, ///< [in] handle of the device to retrieve pointer for. 
ur_program_handle_t @@ -559,12 +570,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( if (ZeResult == ZE_RESULT_ERROR_INVALID_ARGUMENT) { size_t Size; *FunctionPointerRet = 0; - UR_CALL(urProgramGetInfo(Program, UR_PROGRAM_INFO_KERNEL_NAMES, 0, nullptr, - &Size)); + UR_CALL(ur::level_zero::urProgramGetInfo( + Program, UR_PROGRAM_INFO_KERNEL_NAMES, 0, nullptr, &Size)); std::string ClResult(Size, ' '); - UR_CALL(urProgramGetInfo(Program, UR_PROGRAM_INFO_KERNEL_NAMES, - ClResult.size(), &ClResult[0], nullptr)); + UR_CALL(ur::level_zero::urProgramGetInfo( + Program, UR_PROGRAM_INFO_KERNEL_NAMES, ClResult.size(), &ClResult[0], + nullptr)); // Get rid of the null terminator and search for kernel_name // If function can be found return error code to indicate it @@ -584,7 +596,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( return ze2urResult(ZeResult); } -UR_APIEXPORT ur_result_t UR_APICALL urProgramGetGlobalVariablePointer( +ur_result_t urProgramGetGlobalVariablePointer( ur_device_handle_t Device, ///< [in] handle of the device to retrieve the pointer for. ur_program_handle_t @@ -597,11 +609,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetGlobalVariablePointer( void **GlobalVariablePointerRet ///< [out] Returns the pointer to the global ///< variable if it is found in the program. 
) { - std::ignore = Device; std::scoped_lock lock(Program->Mutex); + ze_module_handle_t ZeModuleEntry{}; + ZeModuleEntry = Program->ZeModule; + if (!Program->ZeModuleMap.empty()) { + auto It = Program->ZeModuleMap.find(Device->ZeDevice); + if (It != Program->ZeModuleMap.end()) { + ZeModuleEntry = It->second; + } + } + ze_result_t ZeResult = - zeModuleGetGlobalPointer(Program->ZeModule, GlobalVariableName, + zeModuleGetGlobalPointer(ZeModuleEntry, GlobalVariableName, GlobalVariableSizeRet, GlobalVariablePointerRet); if (ZeResult == ZE_RESULT_ERROR_UNSUPPORTED_FEATURE) { @@ -611,7 +631,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetGlobalVariablePointer( return ze2urResult(ZeResult); } -UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( +ur_result_t urProgramGetInfo( ur_program_handle_t Program, ///< [in] handle of the Program object ur_program_info_t PropName, ///< [in] name of the Program property to query size_t PropSize, ///< [in] the size of the Program property. @@ -632,11 +652,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( case UR_PROGRAM_INFO_CONTEXT: return ReturnValue(Program->Context); case UR_PROGRAM_INFO_NUM_DEVICES: - // TODO: return true number of devices this program exists for. - return ReturnValue(uint32_t{1}); + if (!Program->ZeModuleMap.empty()) + return ReturnValue( + uint32_t{ur_cast(Program->ZeModuleMap.size())}); + else + return ReturnValue(uint32_t{1}); case UR_PROGRAM_INFO_DEVICES: - // TODO: return all devices this program exists for. 
- return ReturnValue(Program->Context->Devices[0]); + if (!Program->ZeModuleMap.empty()) { + std::vector devices; + for (auto &ZeModulePair : Program->ZeModuleMap) { + auto It = Program->ZeModuleMap.find(ZeModulePair.first); + if (It != Program->ZeModuleMap.end()) { + for (auto &Device : Program->Context->getDevices()) { + if (Device->ZeDevice == ZeModulePair.first) { + devices.push_back(Device); + } + } + } + } + return ReturnValue(devices.data(), devices.size()); + } else { + return ReturnValue(Program->Context->getDevices()[0]); + } case UR_PROGRAM_INFO_BINARY_SIZES: { std::shared_lock Guard(Program->Mutex); size_t SzBinary; @@ -645,8 +682,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( Program->State == ur_program_handle_t_::Object) { SzBinary = Program->CodeLength; } else if (Program->State == ur_program_handle_t_::Exe) { - ZE2UR_CALL(zeModuleGetNativeBinary, - (Program->ZeModule, &SzBinary, nullptr)); + if (!Program->ZeModuleMap.empty()) { + std::vector binarySizes; + for (auto &ZeModulePair : Program->ZeModuleMap) { + size_t binarySize = 0; + ZE2UR_CALL(zeModuleGetNativeBinary, + (ZeModulePair.second, &binarySize, nullptr)); + binarySizes.push_back(binarySize); + } + return ReturnValue(binarySizes.data(), binarySizes.size()); + } else { + ZE2UR_CALL(zeModuleGetNativeBinary, + (Program->ZeModule, &SzBinary, nullptr)); + return ReturnValue(SzBinary); + } } else { return UR_RESULT_ERROR_INVALID_PROGRAM; } @@ -655,22 +704,52 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( } case UR_PROGRAM_INFO_BINARIES: { // The caller sets "ParamValue" to an array of pointers, one for each - // device. Since Level Zero supports only one device, there is only one - // pointer. If the pointer is NULL, we don't do anything. Otherwise, we - // copy the program's binary image to the buffer at that pointer. - uint8_t **PBinary = ur_cast(ProgramInfo); - if (!PBinary[0]) - break; - + // device. 
+ uint8_t **PBinary = nullptr; + if (ProgramInfo) { + PBinary = ur_cast(ProgramInfo); + if (!PBinary[0]) { + break; + } + } std::shared_lock Guard(Program->Mutex); + // If the caller is using a Program which is IL, Native or an object, then + // the program has not been built for multiple devices so a single IL is + // returned. if (Program->State == ur_program_handle_t_::IL || Program->State == ur_program_handle_t_::Native || Program->State == ur_program_handle_t_::Object) { - std::memcpy(PBinary[0], Program->Code.get(), Program->CodeLength); + if (PropSizeRet) + *PropSizeRet = Program->CodeLength; + if (PBinary) { + std::memcpy(PBinary[0], Program->Code.get(), Program->CodeLength); + } } else if (Program->State == ur_program_handle_t_::Exe) { + // If the caller is using a Program which is a built binary, then + // the program returned will either be a single module if this is a native + // binary or the native binary for each device will be returned. size_t SzBinary = 0; - ZE2UR_CALL(zeModuleGetNativeBinary, - (Program->ZeModule, &SzBinary, PBinary[0])); + uint8_t *NativeBinaryPtr = nullptr; + if (PBinary) { + NativeBinaryPtr = PBinary[0]; + } + if (!Program->ZeModuleMap.empty()) { + uint32_t deviceIndex = 0; + for (auto &ZeDeviceModule : Program->ZeModuleMap) { + size_t binarySize = 0; + if (PBinary) { + NativeBinaryPtr = PBinary[deviceIndex++]; + } + ZE2UR_CALL(zeModuleGetNativeBinary, + (ZeDeviceModule.second, &binarySize, NativeBinaryPtr)); + SzBinary += binarySize; + } + } else { + ZE2UR_CALL(zeModuleGetNativeBinary, + (Program->ZeModule, &SzBinary, NativeBinaryPtr)); + } + if (PropSizeRet) + *PropSizeRet = SzBinary; } else { return UR_RESULT_ERROR_INVALID_PROGRAM; } @@ -678,15 +757,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( } case UR_PROGRAM_INFO_NUM_KERNELS: { std::shared_lock Guard(Program->Mutex); - uint32_t NumKernels; + uint32_t NumKernels = 0; if (Program->State == ur_program_handle_t_::IL || Program->State == 
ur_program_handle_t_::Native || Program->State == ur_program_handle_t_::Object) { return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE; } else if (Program->State == ur_program_handle_t_::Exe) { - NumKernels = 0; - ZE2UR_CALL(zeModuleGetKernelNames, - (Program->ZeModule, &NumKernels, nullptr)); + if (!Program->ZeModuleMap.empty()) { + ZE2UR_CALL( + zeModuleGetKernelNames, + (Program->ZeModuleMap.begin()->second, &NumKernels, nullptr)); + } else { + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModule, &NumKernels, nullptr)); + } } else { return UR_RESULT_ERROR_INVALID_PROGRAM; } @@ -702,11 +786,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE; } else if (Program->State == ur_program_handle_t_::Exe) { uint32_t Count = 0; - ZE2UR_CALL(zeModuleGetKernelNames, - (Program->ZeModule, &Count, nullptr)); - std::unique_ptr PNames(new const char *[Count]); - ZE2UR_CALL(zeModuleGetKernelNames, - (Program->ZeModule, &Count, PNames.get())); + std::unique_ptr PNames; + if (!Program->ZeModuleMap.empty()) { + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModuleMap.begin()->second, &Count, nullptr)); + PNames = std::make_unique(Count); + ZE2UR_CALL( + zeModuleGetKernelNames, + (Program->ZeModuleMap.begin()->second, &Count, PNames.get())); + } else { + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModule, &Count, nullptr)); + PNames = std::make_unique(Count); + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModule, &Count, PNames.get())); + } for (uint32_t I = 0; I < Count; ++I) { PINames += (I > 0 ? ";" : ""); PINames += PNames[I]; @@ -720,14 +814,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( } catch (...) 
{ return UR_RESULT_ERROR_UNKNOWN; } + case UR_PROGRAM_INFO_IL: + return ReturnValue(Program->Code.get(), Program->CodeLength); default: - die("urProgramGetInfo: not implemented"); + return UR_RESULT_ERROR_INVALID_ENUMERATION; } return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urProgramGetBuildInfo( +ur_result_t urProgramGetBuildInfo( ur_program_handle_t Program, ///< [in] handle of the Program object ur_device_handle_t Device, ///< [in] handle of the Device object ur_program_build_info_t @@ -761,6 +857,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetBuildInfo( // return for programs that were built outside and registered // with urProgramRegister? return ReturnValue(""); + } else if (PropName == UR_PROGRAM_BUILD_INFO_STATUS) { + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } else if (PropName == UR_PROGRAM_BUILD_INFO_LOG) { // Check first to see if the plugin code recorded an error message. if (!Program->ErrorMessage.empty()) { @@ -805,7 +903,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetBuildInfo( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstant( +ur_result_t urProgramSetSpecializationConstant( ur_program_handle_t Program, ///< [in] handle of the Program object uint32_t SpecId, ///< [in] specification constant Id size_t SpecSize, ///< [in] size of the specialization constant value @@ -820,7 +918,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstant( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( +ur_result_t urProgramGetNativeHandle( ur_program_handle_t Program, ///< [in] handle of the program. ur_native_handle_t *NativeProgram ///< [out] a pointer to the native ///< handle of the program. 
@@ -841,7 +939,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( +ur_result_t urProgramCreateWithNativeHandle( ur_native_handle_t NativeProgram, ///< [in] the native handle of the program. ur_context_handle_t Context, ///< [in] handle of the context instance @@ -852,6 +950,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( ///< program object created. ) { std::ignore = Properties; + UR_ASSERT(Context && NativeProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(Program, UR_RESULT_ERROR_INVALID_NULL_POINTER); auto ZeModule = ur_cast(NativeProgram); // We assume here that programs created from a native handle always @@ -871,6 +971,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( return UR_RESULT_SUCCESS; } +ur_result_t urProgramSetSpecializationConstants( + ur_program_handle_t Program, ///< [in] handle of the Program object + uint32_t Count, ///< [in] the number of elements in the pSpecConstants array + const ur_specialization_constant_info_t + *SpecConstants ///< [in][range(0, count)] array of specialization + ///< constant value descriptions +) { + std::scoped_lock Guard(Program->Mutex); + + // Remember the value of this specialization constant until the program is + // built. Note that we only save the pointer to the buffer that contains the + // value. The caller is responsible for maintaining storage for this buffer. + // + // NOTE: SpecSize is unused in Level Zero, the size is known from SPIR-V by + // SpecID. 
+ for (uint32_t SpecIt = 0; SpecIt < Count; SpecIt++) { + uint32_t SpecId = SpecConstants[SpecIt].id; + Program->SpecConstants[SpecId] = SpecConstants[SpecIt].pValue; + } + return UR_RESULT_SUCCESS; +} + +} // namespace ur::level_zero + ur_program_handle_t_::~ur_program_handle_t_() { if (!resourcesReleased) { ur_release_program_resources(true); @@ -905,25 +1029,3 @@ void ur_program_handle_t_::ur_release_program_resources(bool deletion) { resourcesReleased = true; } } - -UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( - ur_program_handle_t Program, ///< [in] handle of the Program object - uint32_t Count, ///< [in] the number of elements in the pSpecConstants array - const ur_specialization_constant_info_t - *SpecConstants ///< [in][range(0, count)] array of specialization - ///< constant value descriptions -) { - std::scoped_lock Guard(Program->Mutex); - - // Remember the value of this specialization constant until the program is - // built. Note that we only save the pointer to the buffer that contains the - // value. The caller is responsible for maintaining storage for this buffer. - // - // NOTE: SpecSize is unused in Level Zero, the size is known from SPIR-V by - // SpecID. - for (uint32_t SpecIt = 0; SpecIt < Count; SpecIt++) { - uint32_t SpecId = SpecConstants[SpecIt].id; - Program->SpecConstants[SpecId] = SpecConstants[SpecIt].pValue; - } - return UR_RESULT_SUCCESS; -} diff --git a/source/adapters/level_zero/program.hpp b/source/adapters/level_zero/program.hpp index 8d148c8fa2..42330adcbf 100644 --- a/source/adapters/level_zero/program.hpp +++ b/source/adapters/level_zero/program.hpp @@ -65,10 +65,21 @@ struct ur_program_handle_t_ : _ur_object { ze_module_constants_t ZeSpecConstants; }; - // Construct a program in IL or Native state. + // Construct a program in IL. 
ur_program_handle_t_(state St, ur_context_handle_t Context, const void *Input, size_t Length) - : Context{Context}, + : Context{Context}, NativeDevice{nullptr}, NativeProperties{nullptr}, + OwnZeModule{true}, State{St}, Code{new uint8_t[Length]}, + CodeLength{Length}, ZeModule{nullptr}, ZeBuildLog{nullptr} { + std::memcpy(Code.get(), Input, Length); + } + + // Construct a program in NATIVE. + ur_program_handle_t_(state St, ur_context_handle_t Context, + ur_device_handle_t Device, + const ur_program_properties_t *Properties, + const void *Input, size_t Length) + : Context{Context}, NativeDevice(Device), NativeProperties(Properties), OwnZeModule{true}, State{St}, Code{new uint8_t[Length]}, CodeLength{Length}, ZeModule{nullptr}, ZeBuildLog{nullptr} { std::memcpy(Code.get(), Input, Length); @@ -78,26 +89,29 @@ struct ur_program_handle_t_ : _ur_object { ur_program_handle_t_(state St, ur_context_handle_t Context, ze_module_handle_t ZeModule, ze_module_build_log_handle_t ZeBuildLog) - : Context{Context}, OwnZeModule{true}, State{St}, ZeModule{ZeModule}, - ZeBuildLog{ZeBuildLog} {} + : Context{Context}, NativeDevice{nullptr}, NativeProperties{nullptr}, + OwnZeModule{true}, State{St}, ZeModule{ZeModule}, ZeBuildLog{ + ZeBuildLog} {} // Construct a program in Exe state (interop). 
ur_program_handle_t_(state St, ur_context_handle_t Context, ze_module_handle_t ZeModule, bool OwnZeModule) - : Context{Context}, OwnZeModule{OwnZeModule}, State{St}, - ZeModule{ZeModule}, ZeBuildLog{nullptr} {} + : Context{Context}, NativeDevice{nullptr}, NativeProperties{nullptr}, + OwnZeModule{OwnZeModule}, State{St}, ZeModule{ZeModule}, ZeBuildLog{ + nullptr} {} // Construct a program from native handle ur_program_handle_t_(state St, ur_context_handle_t Context, ze_module_handle_t ZeModule) - : Context{Context}, OwnZeModule{true}, State{St}, ZeModule{ZeModule}, - ZeBuildLog{nullptr} {} + : Context{Context}, NativeDevice{nullptr}, NativeProperties{nullptr}, + OwnZeModule{true}, State{St}, ZeModule{ZeModule}, ZeBuildLog{nullptr} {} // Construct a program in Invalid state with a custom error message. ur_program_handle_t_(state St, ur_context_handle_t Context, const std::string &ErrorMessage) - : Context{Context}, OwnZeModule{true}, ErrorMessage{ErrorMessage}, - State{St}, ZeModule{nullptr}, ZeBuildLog{nullptr} {} + : Context{Context}, NativeDevice{nullptr}, NativeProperties{nullptr}, + OwnZeModule{true}, ErrorMessage{ErrorMessage}, State{St}, + ZeModule{nullptr}, ZeBuildLog{nullptr} {} ~ur_program_handle_t_(); void ur_release_program_resources(bool deletion); @@ -108,6 +122,12 @@ struct ur_program_handle_t_ : _ur_object { const ur_context_handle_t Context; // Context of the program. + // Device Handle used for the Native Build + ur_device_handle_t NativeDevice; + + // Properties used for the Native Build + const ur_program_properties_t *NativeProperties; + // Indicates if we own the ZeModule or it came from interop that // asked to not transfer the ownership to SYCL RT. 
const bool OwnZeModule; diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 7498072d95..978547df10 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -19,13 +19,11 @@ #include "common.hpp" #include "event.hpp" #include "queue.hpp" -#include "ur_api.h" +#include "ur_interface_loader.hpp" #include "ur_level_zero.hpp" #include "ur_util.hpp" #include "ze_api.h" -#include "v2/queue_factory.hpp" - // Hard limit for the event completion batches. static const uint64_t CompletionBatchesMax = [] { // Default value chosen empirically to maximize the number of asynchronous @@ -101,7 +99,7 @@ bool ur_completion_batch::checkComplete() { return st == COMPLETED; } -ur_result_t ur_completion_batch::seal(ur_queue_handle_legacy_t queue, +ur_result_t ur_completion_batch::seal(ur_queue_handle_t queue, ze_command_list_handle_t cmdlist) { assert(st == ACCUMULATING); @@ -189,7 +187,7 @@ ur_completion_batches::ur_completion_batches() { } ur_result_t ur_completion_batches::tryCleanup( - ur_queue_handle_legacy_t queue, ze_command_list_handle_t cmdlist, + ur_queue_handle_t queue, ze_command_list_handle_t cmdlist, std::vector &events, std::vector &EventListToCleanup) { cleanup(events, EventListToCleanup); @@ -231,7 +229,7 @@ void ur_completion_batches::forceReset() { /// the call, in case of in-order queue it allows to cleanup all preceding /// events. /// @return PI_SUCCESS if successful, PI error code otherwise. -ur_result_t CleanupEventsInImmCmdLists(ur_queue_handle_legacy_t UrQueue, +ur_result_t CleanupEventsInImmCmdLists(ur_queue_handle_t UrQueue, bool QueueLocked, bool QueueSynced, ur_event_handle_t CompletedEvent) { // Handle only immediate command lists here. @@ -305,7 +303,7 @@ ur_result_t CleanupEventsInImmCmdLists(ur_queue_handle_legacy_t UrQueue, /// @param Queue Queue where we look for signalled command lists and cleanup /// events. /// @return PI_SUCCESS if successful, PI error code otherwise. 
-ur_result_t resetCommandLists(ur_queue_handle_legacy_t Queue) { +ur_result_t resetCommandLists(ur_queue_handle_t Queue) { // Handle immediate command lists here, they don't need to be reset and we // only need to cleanup events. if (Queue->UsingImmCmdLists) { @@ -344,7 +342,10 @@ ur_result_t resetCommandLists(ur_queue_handle_legacy_t Queue) { return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::queueGetInfo( +namespace ur::level_zero { + +ur_result_t urQueueGetInfo( + ur_queue_handle_t Queue, ///< [in] handle of the queue object ur_queue_info_t ParamName, ///< [in] name of the queue property to query size_t ParamValueSize, ///< [in] size in bytes of the queue property value ///< provided @@ -352,8 +353,6 @@ ur_result_t ur_queue_handle_legacy_t_::queueGetInfo( size_t *ParamValueSizeRet ///< [out] size in bytes returned in queue ///< property value ) { - auto Queue = this; - std::shared_lock Lock(Queue->Mutex); UrReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); // TODO: consider support for queue properties and size @@ -469,7 +468,7 @@ static bool doEagerInit = [] { return EagerInit ? std::atoi(EagerInit) != 0 : false; }(); -UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( +ur_result_t urQueueCreate( ur_context_handle_t Context, ///< [in] handle of the context object ur_device_handle_t Device, ///< [in] handle of the device object const ur_queue_properties_t @@ -501,16 +500,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( UR_ASSERT(Context->isValidDevice(Device), UR_RESULT_ERROR_INVALID_DEVICE); - // optimized path for immediate, in-order command lists - if (v2::shouldUseQueueV2(Device, Flags)) { - *Queue = v2::createQueue(Context, Device, Props); - return UR_RESULT_SUCCESS; - } - // Create placeholder queues in the compute queue group. // Actual L0 queues will be created at first use. 
std::vector ZeComputeCommandQueues( - Device->QueueGroup[ur_queue_handle_legacy_t_::queue_type::Compute] + Device->QueueGroup[ur_queue_handle_t_::queue_type::Compute] .ZeProperties.numQueues, nullptr); @@ -520,21 +513,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( size_t NumCopyGroups = 0; if (Device->hasMainCopyEngine()) { NumCopyGroups += - Device->QueueGroup[ur_queue_handle_legacy_t_::queue_type::MainCopy] + Device->QueueGroup[ur_queue_handle_t_::queue_type::MainCopy] .ZeProperties.numQueues; } if (Device->hasLinkCopyEngine()) { NumCopyGroups += - Device->QueueGroup[ur_queue_handle_legacy_t_::queue_type::LinkCopy] + Device->QueueGroup[ur_queue_handle_t_::queue_type::LinkCopy] .ZeProperties.numQueues; } std::vector ZeCopyCommandQueues(NumCopyGroups, nullptr); try { - *Queue = new ur_queue_handle_legacy_t_(ZeComputeCommandQueues, - ZeCopyCommandQueues, Context, Device, - true, Flags, ForceComputeIndex); + *Queue = + new ur_queue_handle_t_(ZeComputeCommandQueues, ZeCopyCommandQueues, + Context, Device, true, Flags, ForceComputeIndex); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { @@ -543,7 +536,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( // Do eager initialization of Level Zero handles on request. if (doEagerInit) { - ur_queue_handle_legacy_t Q = Legacy(*Queue); + auto Q = *Queue; // Creates said number of command-lists. 
auto warmupQueueGroup = [Q](bool UseCopyEngine, uint32_t RepeatCount) -> ur_result_t { @@ -584,9 +577,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::queueRetain() { - auto Queue = this; - +ur_result_t urQueueRetain( + ur_queue_handle_t Queue ///< [in] handle of the queue object to get access +) { { std::scoped_lock Lock(Queue->Mutex); Queue->RefCountExternal++; @@ -595,9 +588,9 @@ ur_result_t ur_queue_handle_legacy_t_::queueRetain() { return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::queueRelease() { - auto Queue = this; - +ur_result_t urQueueRelease( + ur_queue_handle_t Queue ///< [in] handle of the queue object to release +) { std::vector EventListToCleanup; { std::scoped_lock Lock(Queue->Mutex); @@ -607,7 +600,7 @@ ur_result_t ur_queue_handle_legacy_t_::queueRelease() { // internal reference count. When the External Reference count == 0, then // cleanup of the queue begins and the final decrement of the internal // reference count is completed. - Queue->RefCount.decrementAndTest(); + static_cast(Queue->RefCount.decrementAndTest()); return UR_RESULT_SUCCESS; } @@ -698,13 +691,12 @@ ur_result_t ur_queue_handle_legacy_t_::queueRelease() { return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::queueGetNativeHandle( +ur_result_t urQueueGetNativeHandle( + ur_queue_handle_t Queue, ///< [in] handle of the queue. ur_queue_native_desc_t *Desc, ur_native_handle_t *NativeQueue ///< [out] a pointer to the native handle of the queue. ) { - auto Queue = this; - // Lock automatically releases when this goes out of scope. 
std::shared_lock lock(Queue->Mutex); @@ -736,24 +728,7 @@ ur_result_t ur_queue_handle_legacy_t_::queueGetNativeHandle( return UR_RESULT_SUCCESS; } -void ur_queue_handle_legacy_t_::ur_queue_group_t::setImmCmdList( - ur_queue_handle_legacy_t queue, ze_command_list_handle_t ZeCommandList) { - // An immediate command list was given to us but we don't have the queue - // descriptor information. Create a dummy and note that it is not recycleable. - ZeStruct ZeQueueDesc; - - ImmCmdLists = std::vector( - 1, - Queue->CommandListMap - .insert(std::pair{ - ZeCommandList, - ur_command_list_info_t(nullptr, true, false, nullptr, ZeQueueDesc, - queue->useCompletionBatching(), false, - false, true)}) - .first); -} - -UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( +ur_result_t urQueueCreateWithNativeHandle( ur_native_handle_t NativeQueue, ///< [in] the native handle of the queue. ur_context_handle_t Context, ///< [in] handle of the context object ur_device_handle_t Device, /// @@ -793,12 +768,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( uint32_t NumEntries = 1; ur_platform_handle_t Platform{}; ur_adapter_handle_t AdapterHandle = GlobalAdapter; - UR_CALL(urPlatformGet(&AdapterHandle, 1, NumEntries, &Platform, nullptr)); + UR_CALL(ur::level_zero::urPlatformGet(&AdapterHandle, 1, NumEntries, + &Platform, nullptr)); ur_device_handle_t UrDevice = Device; if (UrDevice == nullptr) { - UR_CALL(urDeviceGet(Platform, UR_DEVICE_TYPE_GPU, NumEntries, &UrDevice, - nullptr)); + UR_CALL(ur::level_zero::urDeviceGet(Platform, UR_DEVICE_TYPE_GPU, + NumEntries, &UrDevice, nullptr)); } // The NativeHandleDesc has value if if the native handle is an immediate @@ -808,7 +784,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( std::vector CopyQueues; try { - ur_queue_handle_t_ *Queue = new ur_queue_handle_legacy_t_( + ur_queue_handle_t_ *Queue = new ur_queue_handle_t_( ComputeQueues, CopyQueues, Context, UrDevice, OwnNativeHandle, 
Flags); *RetQueue = reinterpret_cast(Queue); } catch (const std::bad_alloc &) { @@ -816,9 +792,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } - auto &InitialGroup = - Legacy(*RetQueue)->ComputeQueueGroupsByTID.begin()->second; - InitialGroup.setImmCmdList(Legacy(*RetQueue), + auto &InitialGroup = (*RetQueue)->ComputeQueueGroupsByTID.begin()->second; + InitialGroup.setImmCmdList(*RetQueue, ur_cast(NativeQueue)); } else { auto ZeQueue = ur_cast(NativeQueue); @@ -831,7 +806,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( std::vector ZeroCopyQueues; try { - ur_queue_handle_t_ *Queue = new ur_queue_handle_legacy_t_( + ur_queue_handle_t_ *Queue = new ur_queue_handle_t_( ZeQueues, ZeroCopyQueues, Context, UrDevice, OwnNativeHandle, Flags); *RetQueue = reinterpret_cast(Queue); } catch (const std::bad_alloc &) { @@ -840,13 +815,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( return UR_RESULT_ERROR_UNKNOWN; } } - Legacy(*RetQueue)->UsingImmCmdLists = (NativeHandleDesc == 1); + (*RetQueue)->UsingImmCmdLists = (NativeHandleDesc == 1); return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::queueFinish() { - auto Queue = this; +ur_result_t urQueueFinish( + ur_queue_handle_t Queue ///< [in] handle of the queue to be finished. +) { if (Queue->UsingImmCmdLists) { // Lock automatically releases when this goes out of scope. std::scoped_lock Lock(Queue->Mutex); @@ -911,12 +887,38 @@ ur_result_t ur_queue_handle_legacy_t_::queueFinish() { return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::queueFlush() { - auto Queue = this; +ur_result_t urQueueFlush( + ur_queue_handle_t Queue ///< [in] handle of the queue to be flushed. 
+) { std::scoped_lock Lock(Queue->Mutex); return Queue->executeAllOpenCommandLists(); } +ur_result_t urEnqueueKernelLaunchCustomExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numPropsInLaunchPropList, + const ur_exp_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + std::ignore = hQueue; + std::ignore = hKernel; + std::ignore = workDim; + std::ignore = pGlobalWorkSize; + std::ignore = pLocalWorkSize; + std::ignore = numPropsInLaunchPropList; + std::ignore = launchPropList; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + + logger::error("[UR][L0] {} function not implemented!", + "{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +} // namespace ur::level_zero + // Configuration of the command-list batching. struct zeCommandListBatchConfig { // Default value of 0. This specifies to use dynamic batch size adjustment. @@ -1071,7 +1073,7 @@ static const zeCommandListBatchConfig ZeCommandListBatchCopyConfig = [] { return ZeCommandListBatchConfig(IsCopy{true}); }(); -ur_queue_handle_legacy_t_::ur_queue_handle_legacy_t_( +ur_queue_handle_t_::ur_queue_handle_t_( std::vector &ComputeQueues, std::vector &CopyQueues, ur_context_handle_t Context, ur_device_handle_t Device, @@ -1097,8 +1099,8 @@ ur_queue_handle_legacy_t_::ur_queue_handle_legacy_t_( // First, see if the queue's device allows for round-robin or it is // fixed to one particular compute CCS (it is so for sub-sub-devices). 
auto &ComputeQueueGroupInfo = Device->QueueGroup[queue_type::Compute]; - ur_queue_group_t ComputeQueueGroup{ - reinterpret_cast(this), queue_type::Compute}; + ur_queue_group_t ComputeQueueGroup{reinterpret_cast(this), + queue_type::Compute}; ComputeQueueGroup.ZeQueues = ComputeQueues; // Create space to hold immediate commandlists corresponding to the // ZeQueues @@ -1144,8 +1146,8 @@ ur_queue_handle_legacy_t_::ur_queue_handle_legacy_t_( ComputeQueueGroupsByTID.set(ComputeQueueGroup); // Copy group initialization. - ur_queue_group_t CopyQueueGroup{ - reinterpret_cast(this), queue_type::MainCopy}; + ur_queue_group_t CopyQueueGroup{reinterpret_cast(this), + queue_type::MainCopy}; const auto &Range = getRangeOfAllowedCopyEngines((ur_device_handle_t)Device); if (Range.first < 0 || Range.second < 0) { // We are asked not to use copy engines, just do nothing. @@ -1177,12 +1179,10 @@ ur_queue_handle_legacy_t_::ur_queue_handle_legacy_t_( ZeCommandListBatchComputeConfig.startSize(); CopyCommandBatch.QueueBatchSize = ZeCommandListBatchCopyConfig.startSize(); - static const bool useDriverCounterBasedEvents = [Device] { + static const bool useDriverCounterBasedEvents = [] { const char *UrRet = std::getenv("UR_L0_USE_DRIVER_COUNTER_BASED_EVENTS"); if (!UrRet) { - if (Device->isPVC()) - return true; - return false; + return true; } return std::atoi(UrRet) != 0; }(); @@ -1192,7 +1192,7 @@ ur_queue_handle_legacy_t_::ur_queue_handle_legacy_t_( Device->Platform->ZeDriverEventPoolCountingEventsExtensionFound; } -void ur_queue_handle_legacy_t_::adjustBatchSizeForFullBatch(bool IsCopy) { +void ur_queue_handle_t_::adjustBatchSizeForFullBatch(bool IsCopy) { auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; auto &ZeCommandListBatchConfig = IsCopy ? 
ZeCommandListBatchCopyConfig : ZeCommandListBatchComputeConfig; @@ -1219,7 +1219,7 @@ void ur_queue_handle_legacy_t_::adjustBatchSizeForFullBatch(bool IsCopy) { } } -void ur_queue_handle_legacy_t_::adjustBatchSizeForPartialBatch(bool IsCopy) { +void ur_queue_handle_t_::adjustBatchSizeForPartialBatch(bool IsCopy) { auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; auto &ZeCommandListBatchConfig = IsCopy ? ZeCommandListBatchCopyConfig : ZeCommandListBatchComputeConfig; @@ -1245,14 +1245,15 @@ void ur_queue_handle_legacy_t_::adjustBatchSizeForPartialBatch(bool IsCopy) { } } -ur_result_t ur_queue_handle_legacy_t_::executeCommandList( - ur_command_list_ptr_t CommandList, bool IsBlocking, bool OKToBatchCommand) { +ur_result_t +ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, + bool IsBlocking, bool OKToBatchCommand) { // Do nothing if command list is already closed. if (CommandList->second.IsClosed) return UR_RESULT_SUCCESS; - bool UseCopyEngine = CommandList->second.isCopy( - reinterpret_cast(this)); + bool UseCopyEngine = + CommandList->second.isCopy(reinterpret_cast(this)); // If the current LastCommandEvent is the nullptr, then it means // either that no command has ever been issued to the queue @@ -1359,7 +1360,7 @@ ur_result_t ur_queue_handle_legacy_t_::executeCommandList( // ur_event_handle_t HostVisibleEvent; auto Res = createEventAndAssociateQueue( - reinterpret_cast(this), &HostVisibleEvent, + reinterpret_cast(this), &HostVisibleEvent, UR_EXT_COMMAND_TYPE_USER, CommandList, /* IsInternal */ false, /* IsMultiDevice */ true, /* HostVisible */ true); @@ -1483,12 +1484,12 @@ ur_result_t ur_queue_handle_legacy_t_::executeCommandList( return UR_RESULT_SUCCESS; } -bool ur_queue_handle_legacy_t_::doReuseDiscardedEvents() { +bool ur_queue_handle_t_::doReuseDiscardedEvents() { return ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents(); } -ur_result_t ur_queue_handle_legacy_t_::resetDiscardedEvent( - 
ur_command_list_ptr_t CommandList) { +ur_result_t +ur_queue_handle_t_::resetDiscardedEvent(ur_command_list_ptr_t CommandList) { if (LastCommandEvent && LastCommandEvent->IsDiscarded) { ZE2UR_CALL(zeCommandListAppendBarrier, (CommandList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); @@ -1521,8 +1522,7 @@ ur_result_t ur_queue_handle_legacy_t_::resetDiscardedEvent( return UR_RESULT_SUCCESS; } -ur_result_t -ur_queue_handle_legacy_t_::addEventToQueueCache(ur_event_handle_t Event) { +ur_result_t ur_queue_handle_t_::addEventToQueueCache(ur_event_handle_t Event) { if (!Event->IsMultiDevice) { auto EventCachesMap = Event->isHostVisible() ? &EventCachesDeviceMap[0] : &EventCachesDeviceMap[1]; @@ -1538,19 +1538,19 @@ ur_queue_handle_legacy_t_::addEventToQueueCache(ur_event_handle_t Event) { return UR_RESULT_SUCCESS; } -void ur_queue_handle_legacy_t_::active_barriers::add(ur_event_handle_t &Event) { +void ur_queue_handle_t_::active_barriers::add(ur_event_handle_t &Event) { Event->RefCount.increment(); Events.push_back(Event); } -ur_result_t ur_queue_handle_legacy_t_::active_barriers::clear() { +ur_result_t ur_queue_handle_t_::active_barriers::clear() { for (const auto &Event : Events) UR_CALL(urEventReleaseInternal(Event)); Events.clear(); return UR_RESULT_SUCCESS; } -void ur_queue_handle_legacy_t_::clearEndTimeRecordings() { +void ur_queue_handle_t_::clearEndTimeRecordings() { uint64_t ZeTimerResolution = Device->ZeDeviceProperties->timerResolution; const uint64_t TimestampMaxValue = Device->getTimestampMask(); @@ -1577,7 +1577,7 @@ void ur_queue_handle_legacy_t_::clearEndTimeRecordings() { EndTimeRecordings.clear(); } -ur_result_t urQueueReleaseInternal(ur_queue_handle_legacy_t Queue) { +ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue) { if (!Queue->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; @@ -1616,33 +1616,33 @@ ur_result_t urQueueReleaseInternal(ur_queue_handle_legacy_t Queue) { return UR_RESULT_SUCCESS; } -bool 
ur_queue_handle_legacy_t_::isBatchingAllowed(bool IsCopy) const { +bool ur_queue_handle_t_::isBatchingAllowed(bool IsCopy) const { auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; return (CommandBatch.QueueBatchSize > 0 && ((UrL0Serialize & UrL0SerializeBlock) == 0)); } -bool ur_queue_handle_legacy_t_::isDiscardEvents() const { +bool ur_queue_handle_t_::isDiscardEvents() const { return ((this->Properties & UR_QUEUE_FLAG_DISCARD_EVENTS) != 0); } -bool ur_queue_handle_legacy_t_::isPriorityLow() const { +bool ur_queue_handle_t_::isPriorityLow() const { return ((this->Properties & UR_QUEUE_FLAG_PRIORITY_LOW) != 0); } -bool ur_queue_handle_legacy_t_::isPriorityHigh() const { +bool ur_queue_handle_t_::isPriorityHigh() const { return ((this->Properties & UR_QUEUE_FLAG_PRIORITY_HIGH) != 0); } -bool ur_queue_handle_legacy_t_::isBatchedSubmission() const { +bool ur_queue_handle_t_::isBatchedSubmission() const { return ((this->Properties & UR_QUEUE_FLAG_SUBMISSION_BATCHED) != 0); } -bool ur_queue_handle_legacy_t_::isImmediateSubmission() const { +bool ur_queue_handle_t_::isImmediateSubmission() const { return ((this->Properties & UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE) != 0); } -bool ur_queue_handle_legacy_t_::isInOrderQueue() const { +bool ur_queue_handle_t_::isInOrderQueue() const { // If out-of-order queue property is not set, then this is a in-order queue. return ((this->Properties & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0); @@ -1672,11 +1672,11 @@ ur_result_t CleanupEventListFromResetCmdList( // TODO: Event release in immediate commandlist mode is driven by the SYCL // runtime. Need to investigate whether relase can be done earlier, at sync // points such as this, to reduce total number of active Events. 
-ur_result_t ur_queue_handle_legacy_t_::synchronize() { +ur_result_t ur_queue_handle_t_::synchronize() { if (!Healthy) return UR_RESULT_SUCCESS; - auto syncImmCmdList = [](ur_queue_handle_legacy_t_ *Queue, + auto syncImmCmdList = [](ur_queue_handle_t_ *Queue, ur_command_list_ptr_t ImmCmdList) { if (ImmCmdList == Queue->CommandListMap.end()) return UR_RESULT_SUCCESS; @@ -1767,9 +1767,8 @@ ur_result_t ur_queue_handle_legacy_t_::synchronize() { return UR_RESULT_SUCCESS; } -ur_event_handle_t -ur_queue_handle_legacy_t_::getEventFromQueueCache(bool IsMultiDevice, - bool HostVisible) { +ur_event_handle_t ur_queue_handle_t_::getEventFromQueueCache(bool IsMultiDevice, + bool HostVisible) { std::list *Cache; if (!IsMultiDevice) { @@ -1801,7 +1800,7 @@ ur_queue_handle_legacy_t_::getEventFromQueueCache(bool IsMultiDevice, // at the end of a command list batch. This will only be true if the event does // not have dependencies or the dependencies are not for events which exist in // this batch. -bool eventCanBeBatched(ur_queue_handle_legacy_t Queue, bool UseCopyEngine, +bool eventCanBeBatched(ur_queue_handle_t Queue, bool UseCopyEngine, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList) { auto &CommandBatch = @@ -1831,7 +1830,7 @@ bool eventCanBeBatched(ur_queue_handle_legacy_t Queue, bool UseCopyEngine, // dependencies, then this command can be enqueued without a signal event set in // a command list batch. The signal event will be appended at the end of the // batch to be signalled at the end of the command list. -ur_result_t setSignalEvent(ur_queue_handle_legacy_t Queue, bool UseCopyEngine, +ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine, ze_event_handle_t *ZeEvent, ur_event_handle_t *Event, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, @@ -1862,7 +1861,7 @@ ur_result_t setSignalEvent(ur_queue_handle_legacy_t Queue, bool UseCopyEngine, // visible pool. 
// \param HostVisible tells if the event must be created in the // host-visible pool. If not set then this function will decide. -ur_result_t createEventAndAssociateQueue(ur_queue_handle_legacy_t Queue, +ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, ur_command_list_ptr_t CommandList, @@ -1918,12 +1917,12 @@ ur_result_t createEventAndAssociateQueue(ur_queue_handle_legacy_t Queue, // event will not be waited/released by SYCL RT, so it must be destroyed by // EventRelease in resetCommandList. if (!IsInternal) - UR_CALL(urEventRetain(*Event)); + UR_CALL(ur::level_zero::urEventRetain(*Event)); return UR_RESULT_SUCCESS; } -void ur_queue_handle_legacy_t_::CaptureIndirectAccesses() { +void ur_queue_handle_t_::CaptureIndirectAccesses() { for (auto &Kernel : KernelsToBeSubmitted) { if (!Kernel->hasIndirectAccess()) continue; @@ -1947,8 +1946,7 @@ void ur_queue_handle_legacy_t_::CaptureIndirectAccesses() { KernelsToBeSubmitted.clear(); } -ur_result_t -ur_queue_handle_legacy_t_::signalEventFromCmdListIfLastEventDiscarded( +ur_result_t ur_queue_handle_t_::signalEventFromCmdListIfLastEventDiscarded( ur_command_list_ptr_t CommandList) { // We signal new event at the end of command list only if we have queue with // discard_events property and the last command event is discarded. @@ -1962,7 +1960,7 @@ ur_queue_handle_legacy_t_::signalEventFromCmdListIfLastEventDiscarded( // from the host. 
ur_event_handle_t Event; UR_CALL(createEventAndAssociateQueue( - reinterpret_cast(this), &Event, + reinterpret_cast(this), &Event, UR_EXT_COMMAND_TYPE_USER, CommandList, /* IsInternal */ false, /* IsMultiDevice */ true, /* HostVisible */ false)); @@ -1974,7 +1972,7 @@ ur_queue_handle_legacy_t_::signalEventFromCmdListIfLastEventDiscarded( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::executeOpenCommandList(bool IsCopy) { +ur_result_t ur_queue_handle_t_::executeOpenCommandList(bool IsCopy) { auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; // If there are any commands still in the open command list for this // queue, then close and execute that command list now. @@ -1988,7 +1986,7 @@ ur_result_t ur_queue_handle_legacy_t_::executeOpenCommandList(bool IsCopy) { return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::resetCommandList( +ur_result_t ur_queue_handle_t_::resetCommandList( ur_command_list_ptr_t CommandList, bool MakeAvailable, std::vector &EventListToCleanup, bool CheckStatus) { bool UseCopyEngine = CommandList->second.isCopy(this); @@ -2090,7 +2088,7 @@ ur_result_t ur_queue_handle_legacy_t_::resetCommandList( return UR_RESULT_SUCCESS; } -bool ur_command_list_info_t::isCopy(ur_queue_handle_legacy_t Queue) const { +bool ur_command_list_info_t::isCopy(ur_queue_handle_t Queue) const { return ZeQueueDesc.ordinal != (uint32_t)Queue->Device ->QueueGroup @@ -2106,7 +2104,7 @@ void ur_command_list_info_t::append(ur_event_handle_t Event) { } ur_command_list_ptr_t -ur_queue_handle_legacy_t_::eventOpenCommandList(ur_event_handle_t Event) { +ur_queue_handle_t_::eventOpenCommandList(ur_event_handle_t Event) { using IsCopy = bool; if (UsingImmCmdLists) { @@ -2131,15 +2129,32 @@ ur_queue_handle_legacy_t_::eventOpenCommandList(ur_event_handle_t Event) { return CommandListMap.end(); } -ur_queue_handle_legacy_t_::ur_queue_group_t & -ur_queue_handle_legacy_t_::getQueueGroup(bool UseCopyEngine) { +void 
ur_queue_handle_t_::ur_queue_group_t::setImmCmdList( + ur_queue_handle_t queue, ze_command_list_handle_t ZeCommandList) { + // An immediate command list was given to us but we don't have the queue + // descriptor information. Create a dummy and note that it is not recycleable. + ZeStruct ZeQueueDesc; + + ImmCmdLists = std::vector( + 1, + Queue->CommandListMap + .insert(std::pair{ + ZeCommandList, + ur_command_list_info_t(nullptr, true, false, nullptr, ZeQueueDesc, + queue->useCompletionBatching(), false, + false, true)}) + .first); +} + +ur_queue_handle_t_::ur_queue_group_t & +ur_queue_handle_t_::getQueueGroup(bool UseCopyEngine) { auto &Map = (UseCopyEngine ? CopyQueueGroupsByTID : ComputeQueueGroupsByTID); return Map.get(); } // Return the index of the next queue to use based on a // round robin strategy and the queue group ordinal. -uint32_t ur_queue_handle_legacy_t_::ur_queue_group_t::getQueueIndex( +uint32_t ur_queue_handle_t_::ur_queue_group_t::getQueueIndex( uint32_t *QueueGroupOrdinal, uint32_t *QueueIndex, bool QueryOnly) { auto CurrentIndex = NextIndex; @@ -2173,8 +2188,7 @@ uint32_t ur_queue_handle_legacy_t_::ur_queue_group_t::getQueueIndex( // This function will return one of possibly multiple available native // queues and the value of the queue group ordinal. ze_command_queue_handle_t & -ur_queue_handle_legacy_t_::ur_queue_group_t::getZeQueue( - uint32_t *QueueGroupOrdinal) { +ur_queue_handle_t_::ur_queue_group_t::getZeQueue(uint32_t *QueueGroupOrdinal) { // QueueIndex is the proper L0 index. 
// Index is the plugins concept of index, with main and link copy engines in @@ -2219,7 +2233,7 @@ ur_queue_handle_legacy_t_::ur_queue_group_t::getZeQueue( return ZeQueue; } -int32_t ur_queue_handle_legacy_t_::ur_queue_group_t::getCmdQueueOrdinal( +int32_t ur_queue_handle_t_::ur_queue_group_t::getCmdQueueOrdinal( ze_command_queue_handle_t CmdQueue) { // Find out the right queue group ordinal (first queue might be "main" or // "link") @@ -2231,7 +2245,7 @@ int32_t ur_queue_handle_legacy_t_::ur_queue_group_t::getCmdQueueOrdinal( return Queue->Device->QueueGroup[QueueType].ZeOrdinal; } -bool ur_queue_handle_legacy_t_::useCompletionBatching() { +bool ur_queue_handle_t_::useCompletionBatching() { static bool enabled = getenv_tobool( "UR_L0_IMMEDIATE_COMMANDLISTS_BATCH_EVENT_COMPLETIONS", false); return enabled && !isInOrderQueue() && UsingImmCmdLists; @@ -2241,7 +2255,7 @@ bool ur_queue_handle_legacy_t_::useCompletionBatching() { // fence tracking its completion. This command list & fence are added to the // map of command lists in this queue with ZeFenceInUse = false. // The caller must hold a lock of the queue already. -ur_result_t ur_queue_handle_legacy_t_::createCommandList( +ur_result_t ur_queue_handle_t_::createCommandList( bool UseCopyEngine, ur_command_list_ptr_t &CommandList, ze_command_queue_handle_t *ForcedCmdQueue) { @@ -2284,8 +2298,8 @@ ur_result_t ur_queue_handle_legacy_t_::createCommandList( } ur_result_t -ur_queue_handle_legacy_t_::insertActiveBarriers(ur_command_list_ptr_t &CmdList, - bool UseCopyEngine) { +ur_queue_handle_t_::insertActiveBarriers(ur_command_list_ptr_t &CmdList, + bool UseCopyEngine) { // Early exit if there are no active barriers. 
if (ActiveBarriers.empty()) return UR_RESULT_SUCCESS; @@ -2294,7 +2308,7 @@ ur_queue_handle_legacy_t_::insertActiveBarriers(ur_command_list_ptr_t &CmdList, _ur_ze_event_list_t ActiveBarriersWaitList; UR_CALL(ActiveBarriersWaitList.createAndRetainUrZeEventList( ActiveBarriers.vector().size(), ActiveBarriers.vector().data(), - reinterpret_cast(this), UseCopyEngine)); + reinterpret_cast(this), UseCopyEngine)); // We can now replace active barriers with the ones in the wait list. UR_CALL(ActiveBarriers.clear()); @@ -2310,7 +2324,7 @@ ur_queue_handle_legacy_t_::insertActiveBarriers(ur_command_list_ptr_t &CmdList, ur_event_handle_t Event = nullptr; if (auto Res = createEventAndAssociateQueue( - reinterpret_cast(this), &Event, + reinterpret_cast(this), &Event, UR_EXT_COMMAND_TYPE_USER, CmdList, /* IsInternal */ true, /* IsMultiDevice */ true)) return Res; @@ -2326,7 +2340,7 @@ ur_queue_handle_legacy_t_::insertActiveBarriers(ur_command_list_ptr_t &CmdList, return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_handle_legacy_t_::insertStartBarrierIfDiscardEventsMode( +ur_result_t ur_queue_handle_t_::insertStartBarrierIfDiscardEventsMode( ur_command_list_ptr_t &CmdList) { // If current command list is different from the last command list then insert // a barrier waiting for the last command event. @@ -2352,7 +2366,7 @@ static const bool UseCopyEngineForInOrderQueue = [] { (std::stoi(CopyEngineForInOrderQueue) != 0)); }(); -bool ur_queue_handle_legacy_t_::useCopyEngine(bool PreferCopyEngine) const { +bool ur_queue_handle_t_::useCopyEngine(bool PreferCopyEngine) const { auto InitialCopyGroup = CopyQueueGroupsByTID.begin()->second; return PreferCopyEngine && InitialCopyGroup.ZeQueues.size() > 0 && (!isInOrderQueue() || UseCopyEngineForInOrderQueue); @@ -2360,8 +2374,7 @@ bool ur_queue_handle_legacy_t_::useCopyEngine(bool PreferCopyEngine) const { // This function will return one of po6ssibly multiple available // immediate commandlists associated with this Queue. 
-ur_command_list_ptr_t & -ur_queue_handle_legacy_t_::ur_queue_group_t::getImmCmdList() { +ur_command_list_ptr_t &ur_queue_handle_t_::ur_queue_group_t::getImmCmdList() { uint32_t QueueIndex, QueueOrdinal; auto Index = getQueueIndex(&QueueOrdinal, &QueueIndex); @@ -2373,6 +2386,7 @@ ur_queue_handle_legacy_t_::ur_queue_group_t::getImmCmdList() { ZeCommandQueueDesc.ordinal = QueueOrdinal; ZeCommandQueueDesc.index = QueueIndex; ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; + bool isInOrderList = false; const char *Priority = "Normal"; if (Queue->isPriorityLow()) { ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW; @@ -2388,6 +2402,7 @@ ur_queue_handle_legacy_t_::ur_queue_group_t::getImmCmdList() { } if (Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue()) { + isInOrderList = true; ZeCommandQueueDesc.flags |= ZE_COMMAND_QUEUE_FLAG_IN_ORDER; } @@ -2436,7 +2451,7 @@ ur_queue_handle_legacy_t_::ur_queue_group_t::getImmCmdList() { ZeCommandList, ur_command_list_info_t( nullptr, true, false, nullptr, ZeCommandQueueDesc, - Queue->useCompletionBatching(), true, false, true)}) + Queue->useCompletionBatching(), true, isInOrderList, true)}) .first; return ImmCmdLists[Index]; @@ -2465,7 +2480,7 @@ static const size_t ImmCmdListsEventCleanupThreshold = [] { return Threshold; }(); -size_t ur_queue_handle_legacy_t_::getImmdCmmdListsEventCleanupThreshold() { +size_t ur_queue_handle_t_::getImmdCmmdListsEventCleanupThreshold() { return useCompletionBatching() ? 
CompletionEventsPerBatch : ImmCmdListsEventCleanupThreshold; } diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index 6e2444d2fa..699d7ec960 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -20,19 +20,15 @@ #include #include -#include +#include #include #include #include "common.hpp" #include "device.hpp" -#include "queue_api.hpp" - -struct ur_queue_handle_legacy_t_; -using ur_queue_handle_legacy_t = ur_queue_handle_legacy_t_ *; extern "C" { -ur_result_t urQueueReleaseInternal(ur_queue_handle_legacy_t Queue); +ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue); } // extern "C" struct ur_completion_batch; @@ -74,8 +70,7 @@ struct ur_completion_batch { // Seals the event batch and appends a barrier to the command list. // Adding any further events after this, but before reset, is undefined. - ur_result_t seal(ur_queue_handle_legacy_t queue, - ze_command_list_handle_t cmdlist); + ur_result_t seal(ur_queue_handle_t queue, ze_command_list_handle_t cmdlist); // Resets a complete batch back to an empty state. Cleanups internal state // but keeps allocated resources for reuse. @@ -117,7 +112,7 @@ struct ur_completion_batches { // returned to indicate that there are no batches available. // This is safe, but will increase how many events are associated // with the active batch. 
- ur_result_t tryCleanup(ur_queue_handle_legacy_t queue, + ur_result_t tryCleanup(ur_queue_handle_t queue, ze_command_list_handle_t cmdlist, std::vector &EventList, std::vector &EventListToCleanup); @@ -154,10 +149,10 @@ struct ur_completion_batches { ur_completion_batch_it active; }; -ur_result_t resetCommandLists(ur_queue_handle_legacy_t Queue); +ur_result_t resetCommandLists(ur_queue_handle_t Queue); ur_result_t -CleanupEventsInImmCmdLists(ur_queue_handle_legacy_t UrQueue, - bool QueueLocked = false, bool QueueSynced = false, +CleanupEventsInImmCmdLists(ur_queue_handle_t UrQueue, bool QueueLocked = false, + bool QueueSynced = false, ur_event_handle_t CompletedEvent = nullptr); // Structure describing the specific use of a command-list in a queue. @@ -208,7 +203,7 @@ struct ur_command_list_info_t { bool IsImmediate; // Helper functions to tell if this is a copy command-list. - bool isCopy(ur_queue_handle_legacy_t Queue) const; + bool isCopy(ur_queue_handle_t Queue) const; // An optional event completion batching mechanism for out-of-order immediate // command lists. @@ -230,209 +225,23 @@ using ur_command_list_map_t = // The iterator pointing to a specific command-list in use. 
using ur_command_list_ptr_t = ur_command_list_map_t::iterator; -struct ur_queue_handle_legacy_t_ : _ur_object, public ur_queue_handle_t_ { - ur_queue_handle_legacy_t_( - std::vector &ComputeQueues, - std::vector &CopyQueues, - ur_context_handle_t Context, ur_device_handle_t Device, - bool OwnZeCommandQueue, ur_queue_flags_t Properties = 0, - int ForceComputeIndex = -1); - - ur_result_t queueGetInfo(ur_queue_info_t propName, size_t propSize, - void *pPropValue, size_t *pPropSizeRet) override; - ur_result_t queueRetain() override; - ur_result_t queueRelease() override; - ur_result_t queueGetNativeHandle(ur_queue_native_desc_t *pDesc, - ur_native_handle_t *phNativeQueue) override; - ur_result_t queueFinish() override; - ur_result_t queueFlush() override; - ur_result_t enqueueKernelLaunch(ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, - const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t - enqueueEventsWaitWithBarrier(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueMemBufferRead(ur_mem_handle_t hBuffer, bool blockingRead, - size_t offset, size_t size, void *pDst, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueMemBufferWrite(ur_mem_handle_t hBuffer, bool blockingWrite, - size_t offset, size_t size, - const void *pSrc, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueMemBufferReadRect( - ur_mem_handle_t hBuffer, bool blockingRead, ur_rect_offset_t bufferOrigin, - ur_rect_offset_t hostOrigin, 
ur_rect_region_t region, - size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, - size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueMemBufferWriteRect( - ur_mem_handle_t hBuffer, bool blockingWrite, - ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, - ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, - size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueMemBufferCopy(ur_mem_handle_t hBufferSrc, - ur_mem_handle_t hBufferDst, size_t srcOffset, - size_t dstOffset, size_t size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueMemBufferCopyRect( - ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, - ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, - ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueMemBufferFill(ur_mem_handle_t hBuffer, - const void *pPattern, size_t patternSize, - size_t offset, size_t size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueMemImageRead(ur_mem_handle_t hImage, bool blockingRead, - ur_rect_offset_t origin, - ur_rect_region_t region, size_t rowPitch, - size_t slicePitch, void *pDst, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, - ur_rect_offset_t origin, - ur_rect_region_t region, size_t rowPitch, - size_t 
slicePitch, void *pSrc, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t - enqueueMemImageCopy(ur_mem_handle_t hImageSrc, ur_mem_handle_t hImageDst, - ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, - ur_rect_region_t region, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueMemBufferMap(ur_mem_handle_t hBuffer, bool blockingMap, - ur_map_flags_t mapFlags, size_t offset, - size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent, - void **ppRetMap) override; - ur_result_t enqueueMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueUSMFill(void *pMem, size_t patternSize, - const void *pPattern, size_t size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueUSMMemcpy(bool blocking, void *pDst, const void *pSrc, - size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueUSMFill2D(void *, size_t, size_t, const void *, size_t, - size_t, uint32_t, const ur_event_handle_t *, - ur_event_handle_t *) override; - ur_result_t enqueueUSMMemcpy2D(bool, void *, size_t, const void *, size_t, - size_t, size_t, uint32_t, - const ur_event_handle_t *, - ur_event_handle_t *) override; - ur_result_t enqueueUSMPrefetch(const void *pMem, size_t size, - ur_usm_migration_flags_t flags, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueUSMAdvise(const void *pMem, size_t size, - ur_usm_advice_flags_t advice, - ur_event_handle_t *phEvent) override; - ur_result_t 
enqueueDeviceGlobalVariableWrite( - ur_program_handle_t hProgram, const char *name, bool blockingWrite, - size_t count, size_t offset, const void *pSrc, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueDeviceGlobalVariableRead( - ur_program_handle_t hProgram, const char *name, bool blockingRead, - size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueReadHostPipe(ur_program_handle_t hProgram, - const char *pipe_symbol, bool blocking, - void *pDst, size_t size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueWriteHostPipe(ur_program_handle_t hProgram, - const char *pipe_symbol, bool blocking, - void *pSrc, size_t size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t bindlessImagesImageCopyExp( - const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, - const ur_image_desc_t *pDstImageDesc, - const ur_image_format_t *pSrcImageFormat, - const ur_image_format_t *pDstImageFormat, - ur_exp_image_copy_region_t *pCopyRegion, - ur_exp_image_copy_flags_t imageCopyFlags, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t bindlessImagesWaitExternalSemaphoreExp( - ur_exp_interop_semaphore_handle_t hSemaphore, bool hasWaitValue, - uint64_t waitValue, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t bindlessImagesSignalExternalSemaphoreExp( - ur_exp_interop_semaphore_handle_t hSemaphore, bool hasSignalValue, - uint64_t signalValue, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; 
- ur_result_t enqueueCooperativeKernelLaunchExp( - ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t - enqueueTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueKernelLaunchCustomExp( - ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numPropsInLaunchPropList, - const ur_exp_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t - enqueueNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, - uint32_t, const ur_mem_handle_t *, - const ur_exp_enqueue_native_command_properties_t *, - uint32_t, const ur_event_handle_t *, - ur_event_handle_t *) override; +struct ur_queue_handle_t_ : _ur_object { + ur_queue_handle_t_(std::vector &ComputeQueues, + std::vector &CopyQueues, + ur_context_handle_t Context, ur_device_handle_t Device, + bool OwnZeCommandQueue, ur_queue_flags_t Properties = 0, + int ForceComputeIndex = -1); using queue_type = ur_device_handle_t_::queue_group_info_t::type; // PI queue is in general a one to many mapping to L0 native queues. struct ur_queue_group_t { - ur_queue_handle_legacy_t Queue; + ur_queue_handle_t Queue; ur_queue_group_t() = delete; // The Queue argument captures the enclosing PI queue. // The Type argument specifies the type of this queue group. // The actual ZeQueues are populated at PI queue construction. - ur_queue_group_t(ur_queue_handle_legacy_t Queue, queue_type Type) + ur_queue_group_t(ur_queue_handle_t Queue, queue_type Type) : Queue(Queue), Type(Type) {} // The type of the queue group. 
@@ -462,8 +271,7 @@ struct ur_queue_handle_legacy_t_ : _ur_object, public ur_queue_handle_t_ { ze_command_queue_handle_t &getZeQueue(uint32_t *QueueGroupOrdinal); // This function sets an immediate commandlist from the interop interface. - void setImmCmdList(ur_queue_handle_legacy_t queue, - ze_command_list_handle_t); + void setImmCmdList(ur_queue_handle_t queue, ze_command_list_handle_t); // This function returns the next immediate commandlist to use. ur_command_list_ptr_t &getImmCmdList(); @@ -530,15 +338,15 @@ struct ur_queue_handle_legacy_t_ : _ur_object, public ur_queue_handle_t_ { pi_queue_group_by_tid_t CopyQueueGroupsByTID; // Keeps the PI context to which this queue belongs. - // This field is only set at ur_queue_handle_legacy_t_ creation time, and + // This field is only set at ur_queue_handle_t_ creation time, and // cannot change. Therefore it can be accessed without holding a lock on this - // ur_queue_handle_legacy_t_. + // ur_queue_handle_t_. const ur_context_handle_t Context; // Keeps the PI device to which this queue belongs. - // This field is only set at ur_queue_handle_legacy_t_ creation time, and + // This field is only set at ur_queue_handle_t_ creation time, and // cannot change. Therefore it can be accessed without holding a lock on this - // ur_queue_handle_legacy_t_. + // ur_queue_handle_t_. const ur_device_handle_t Device; // A queue may use either standard or immediate commandlists. At queue @@ -881,21 +689,10 @@ struct ur_queue_handle_legacy_t_ : _ur_object, public ur_queue_handle_t_ { // Threshold for cleaning up the EventList for immediate command lists. size_t getImmdCmmdListsEventCleanupThreshold(); -}; - -template QueueT GetQueue(ur_queue_handle_t Queue) { - if (!Queue) - return nullptr; - auto *Q = dynamic_cast(Queue); - if (!Q) { - throw UR_RESULT_ERROR_INVALID_QUEUE; - } - return Q; -} -static inline ur_queue_handle_legacy_t Legacy(ur_queue_handle_t Queue) { - return GetQueue(Queue); -} + // Pointer to the unified handle. 
+ ur_queue_handle_t_ *UnifiedHandle; +}; // This helper function creates a ur_event_handle_t and associate a // ur_queue_handle_t. Note that the caller of this function must have acquired @@ -910,18 +707,16 @@ static inline ur_queue_handle_legacy_t Legacy(ur_queue_handle_t Queue) { // multiple devices. // \param ForceHostVisible tells if the event must be created in // the host-visible pool -ur_result_t -createEventAndAssociateQueue(ur_queue_handle_legacy_t Queue, - ur_event_handle_t *Event, ur_command_t CommandType, - ur_command_list_ptr_t CommandList, bool IsInternal, - bool IsMultiDevice, - std::optional HostVisible = std::nullopt); +ur_result_t createEventAndAssociateQueue( + ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, + ur_command_list_ptr_t CommandList, bool IsInternal, bool IsMultiDevice, + std::optional HostVisible = std::nullopt); // This helper function checks to see if an event for a command can be included // at the end of a command list batch. This will only be true if the event does // not have dependencies or the dependencies are not for events which exist in // this batch. -bool eventCanBeBatched(ur_queue_handle_legacy_t Queue, bool UseCopyEngine, +bool eventCanBeBatched(ur_queue_handle_t Queue, bool UseCopyEngine, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList); @@ -930,7 +725,7 @@ bool eventCanBeBatched(ur_queue_handle_legacy_t Queue, bool UseCopyEngine, // dependencies, then this command can be enqueued without a signal event set in // a command list batch. The signal event will be appended at the end of the // batch to be signalled at the end of the command list. 
-ur_result_t setSignalEvent(ur_queue_handle_legacy_t Queue, bool UseCopyEngine, +ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine, ze_event_handle_t *ZeEvent, ur_event_handle_t *Event, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, diff --git a/source/adapters/level_zero/sampler.cpp b/source/adapters/level_zero/sampler.cpp index 54ca1b6672..d48e6aeede 100644 --- a/source/adapters/level_zero/sampler.cpp +++ b/source/adapters/level_zero/sampler.cpp @@ -12,7 +12,9 @@ #include "logger/ur_logger.hpp" #include "ur_level_zero.hpp" -UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( +namespace ur::level_zero { + +ur_result_t urSamplerCreate( ur_context_handle_t Context, ///< [in] handle of the context object const ur_sampler_desc_t *Props, ///< [in] specifies a list of sampler property names and their @@ -109,17 +111,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urSamplerRetain( - ur_sampler_handle_t - Sampler ///< [in] handle of the sampler object to get access +ur_result_t +urSamplerRetain(ur_sampler_handle_t + Sampler ///< [in] handle of the sampler object to get access ) { Sampler->RefCount.increment(); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urSamplerRelease( - ur_sampler_handle_t - Sampler ///< [in] handle of the sampler object to release +ur_result_t +urSamplerRelease(ur_sampler_handle_t + Sampler ///< [in] handle of the sampler object to release ) { if (!Sampler->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; @@ -133,7 +135,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerRelease( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urSamplerGetInfo( +ur_result_t urSamplerGetInfo( ur_sampler_handle_t Sampler, ///< [in] handle of the sampler object ur_sampler_info_t PropName, ///< [in] name of the sampler property to query size_t PropValueSize, ///< [in] size in bytes of the sampler property value 
@@ -152,7 +154,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerGetInfo( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urSamplerGetNativeHandle( +ur_result_t urSamplerGetNativeHandle( ur_sampler_handle_t Sampler, ///< [in] handle of the sampler. ur_native_handle_t *NativeSampler ///< [out] a pointer to the native ///< handle of the sampler. @@ -164,7 +166,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerGetNativeHandle( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreateWithNativeHandle( +ur_result_t urSamplerCreateWithNativeHandle( ur_native_handle_t NativeSampler, ///< [in] the native handle of the sampler. ur_context_handle_t Context, ///< [in] handle of the context object @@ -182,3 +184,4 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreateWithNativeHandle( "{} function not implemented!", __FUNCTION__); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index fe3ccdf22a..df4bd5b640 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -1,19 +1,19 @@ -//===--------- ur_interface_loader.cpp - Level Zero Adapter----------------===// +//===--------- ur_interface_loader.cpp - Level Zero Adapter ------------===// // -// Copyright (C) 2023 Intel Corporation +// Copyright (C) 2024 Intel Corporation // // Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM // Exceptions. 
See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// - #include #include -namespace { +#include "ur_interface_loader.hpp" -ur_result_t validateProcInputs(ur_api_version_t version, void *pDdiTable) { +static ur_result_t validateProcInputs(ur_api_version_t version, + void *pDdiTable) { if (nullptr == pDdiTable) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } @@ -24,472 +24,613 @@ ur_result_t validateProcInputs(ur_api_version_t version, void *pDdiTable) { } return UR_RESULT_SUCCESS; } -} // namespace -#if defined(__cplusplus) +#ifdef UR_STATIC_ADAPTER_LEVEL_ZERO +namespace ur::level_zero { +#elif defined(__cplusplus) extern "C" { #endif -UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_global_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( + ur_api_version_t version, ur_global_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnAdapterGet = urAdapterGet; - pDdiTable->pfnAdapterRelease = urAdapterRelease; - pDdiTable->pfnAdapterRetain = urAdapterRetain; - pDdiTable->pfnAdapterGetLastError = urAdapterGetLastError; - pDdiTable->pfnAdapterGetInfo = urAdapterGetInfo; - return retVal; + pDdiTable->pfnAdapterGet = ur::level_zero::urAdapterGet; + pDdiTable->pfnAdapterRelease = ur::level_zero::urAdapterRelease; + pDdiTable->pfnAdapterRetain = ur::level_zero::urAdapterRetain; + pDdiTable->pfnAdapterGetLastError = ur::level_zero::urAdapterGetLastError; + pDdiTable->pfnAdapterGetInfo = ur::level_zero::urAdapterGetInfo; + + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL 
urGetContextProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_context_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( + ur_api_version_t version, ur_bindless_images_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnCreate = urContextCreate; - pDdiTable->pfnRetain = urContextRetain; - pDdiTable->pfnRelease = urContextRelease; - pDdiTable->pfnGetInfo = urContextGetInfo; - pDdiTable->pfnGetNativeHandle = urContextGetNativeHandle; - pDdiTable->pfnCreateWithNativeHandle = urContextCreateWithNativeHandle; - pDdiTable->pfnSetExtendedDeleter = urContextSetExtendedDeleter; + pDdiTable->pfnUnsampledImageHandleDestroyExp = + ur::level_zero::urBindlessImagesUnsampledImageHandleDestroyExp; + pDdiTable->pfnSampledImageHandleDestroyExp = + ur::level_zero::urBindlessImagesSampledImageHandleDestroyExp; + pDdiTable->pfnImageAllocateExp = + ur::level_zero::urBindlessImagesImageAllocateExp; + pDdiTable->pfnImageFreeExp = ur::level_zero::urBindlessImagesImageFreeExp; + pDdiTable->pfnUnsampledImageCreateExp = + ur::level_zero::urBindlessImagesUnsampledImageCreateExp; + pDdiTable->pfnSampledImageCreateExp = + ur::level_zero::urBindlessImagesSampledImageCreateExp; + pDdiTable->pfnImageCopyExp = ur::level_zero::urBindlessImagesImageCopyExp; + pDdiTable->pfnImageGetInfoExp = + ur::level_zero::urBindlessImagesImageGetInfoExp; + pDdiTable->pfnMipmapGetLevelExp = + ur::level_zero::urBindlessImagesMipmapGetLevelExp; + pDdiTable->pfnMipmapFreeExp = ur::level_zero::urBindlessImagesMipmapFreeExp; + pDdiTable->pfnImportExternalMemoryExp = + ur::level_zero::urBindlessImagesImportExternalMemoryExp; + pDdiTable->pfnMapExternalArrayExp = + 
ur::level_zero::urBindlessImagesMapExternalArrayExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + ur::level_zero::urBindlessImagesMapExternalLinearMemoryExp; + pDdiTable->pfnReleaseExternalMemoryExp = + ur::level_zero::urBindlessImagesReleaseExternalMemoryExp; + pDdiTable->pfnImportExternalSemaphoreExp = + ur::level_zero::urBindlessImagesImportExternalSemaphoreExp; + pDdiTable->pfnReleaseExternalSemaphoreExp = + ur::level_zero::urBindlessImagesReleaseExternalSemaphoreExp; + pDdiTable->pfnWaitExternalSemaphoreExp = + ur::level_zero::urBindlessImagesWaitExternalSemaphoreExp; + pDdiTable->pfnSignalExternalSemaphoreExp = + ur::level_zero::urBindlessImagesSignalExternalSemaphoreExp; - return retVal; + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_enqueue_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( + ur_api_version_t version, ur_command_buffer_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; - pDdiTable->pfnEventsWait = urEnqueueEventsWait; - pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; - pDdiTable->pfnMemBufferRead = urEnqueueMemBufferRead; - pDdiTable->pfnMemBufferWrite = urEnqueueMemBufferWrite; - pDdiTable->pfnMemBufferReadRect = urEnqueueMemBufferReadRect; - pDdiTable->pfnMemBufferWriteRect = urEnqueueMemBufferWriteRect; - pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy; - pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect; - pDdiTable->pfnMemBufferFill = urEnqueueMemBufferFill; - pDdiTable->pfnMemImageRead = urEnqueueMemImageRead; - pDdiTable->pfnMemImageWrite = 
urEnqueueMemImageWrite; - pDdiTable->pfnMemImageCopy = urEnqueueMemImageCopy; - pDdiTable->pfnMemBufferMap = urEnqueueMemBufferMap; - pDdiTable->pfnMemUnmap = urEnqueueMemUnmap; - pDdiTable->pfnUSMFill = urEnqueueUSMFill; - pDdiTable->pfnUSMMemcpy = urEnqueueUSMMemcpy; - pDdiTable->pfnUSMPrefetch = urEnqueueUSMPrefetch; - pDdiTable->pfnUSMAdvise = urEnqueueUSMAdvise; - pDdiTable->pfnUSMFill2D = urEnqueueUSMFill2D; - pDdiTable->pfnUSMMemcpy2D = urEnqueueUSMMemcpy2D; - pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; - pDdiTable->pfnDeviceGlobalVariableRead = urEnqueueDeviceGlobalVariableRead; - - return retVal; + pDdiTable->pfnCreateExp = ur::level_zero::urCommandBufferCreateExp; + pDdiTable->pfnRetainExp = ur::level_zero::urCommandBufferRetainExp; + pDdiTable->pfnReleaseExp = ur::level_zero::urCommandBufferReleaseExp; + pDdiTable->pfnFinalizeExp = ur::level_zero::urCommandBufferFinalizeExp; + pDdiTable->pfnAppendKernelLaunchExp = + ur::level_zero::urCommandBufferAppendKernelLaunchExp; + pDdiTable->pfnAppendUSMMemcpyExp = + ur::level_zero::urCommandBufferAppendUSMMemcpyExp; + pDdiTable->pfnAppendUSMFillExp = + ur::level_zero::urCommandBufferAppendUSMFillExp; + pDdiTable->pfnAppendMemBufferCopyExp = + ur::level_zero::urCommandBufferAppendMemBufferCopyExp; + pDdiTable->pfnAppendMemBufferWriteExp = + ur::level_zero::urCommandBufferAppendMemBufferWriteExp; + pDdiTable->pfnAppendMemBufferReadExp = + ur::level_zero::urCommandBufferAppendMemBufferReadExp; + pDdiTable->pfnAppendMemBufferCopyRectExp = + ur::level_zero::urCommandBufferAppendMemBufferCopyRectExp; + pDdiTable->pfnAppendMemBufferWriteRectExp = + ur::level_zero::urCommandBufferAppendMemBufferWriteRectExp; + pDdiTable->pfnAppendMemBufferReadRectExp = + ur::level_zero::urCommandBufferAppendMemBufferReadRectExp; + pDdiTable->pfnAppendMemBufferFillExp = + ur::level_zero::urCommandBufferAppendMemBufferFillExp; + pDdiTable->pfnAppendUSMPrefetchExp = + 
ur::level_zero::urCommandBufferAppendUSMPrefetchExp; + pDdiTable->pfnAppendUSMAdviseExp = + ur::level_zero::urCommandBufferAppendUSMAdviseExp; + pDdiTable->pfnEnqueueExp = ur::level_zero::urCommandBufferEnqueueExp; + pDdiTable->pfnRetainCommandExp = + ur::level_zero::urCommandBufferRetainCommandExp; + pDdiTable->pfnReleaseCommandExp = + ur::level_zero::urCommandBufferReleaseCommandExp; + pDdiTable->pfnUpdateKernelLaunchExp = + ur::level_zero::urCommandBufferUpdateKernelLaunchExp; + pDdiTable->pfnUpdateSignalEventExp = + ur::level_zero::urCommandBufferUpdateSignalEventExp; + pDdiTable->pfnUpdateWaitEventsExp = + ur::level_zero::urCommandBufferUpdateWaitEventsExp; + pDdiTable->pfnGetInfoExp = ur::level_zero::urCommandBufferGetInfoExp; + pDdiTable->pfnCommandGetInfoExp = + ur::level_zero::urCommandBufferCommandGetInfoExp; + + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_event_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL urGetContextProcAddrTable( + ur_api_version_t version, ur_context_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnGetInfo = urEventGetInfo; - pDdiTable->pfnGetProfilingInfo = urEventGetProfilingInfo; - pDdiTable->pfnWait = urEventWait; - pDdiTable->pfnRetain = urEventRetain; - pDdiTable->pfnRelease = urEventRelease; - pDdiTable->pfnGetNativeHandle = urEventGetNativeHandle; - pDdiTable->pfnCreateWithNativeHandle = urEventCreateWithNativeHandle; - pDdiTable->pfnSetCallback = urEventSetCallback; - - return retVal; + + pDdiTable->pfnCreate = ur::level_zero::urContextCreate; + pDdiTable->pfnRetain = ur::level_zero::urContextRetain; + pDdiTable->pfnRelease = 
ur::level_zero::urContextRelease; + pDdiTable->pfnGetInfo = ur::level_zero::urContextGetInfo; + pDdiTable->pfnGetNativeHandle = ur::level_zero::urContextGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = + ur::level_zero::urContextCreateWithNativeHandle; + pDdiTable->pfnSetExtendedDeleter = + ur::level_zero::urContextSetExtendedDeleter; + + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_kernel_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( + ur_api_version_t version, ur_enqueue_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnCreate = urKernelCreate; - pDdiTable->pfnGetInfo = urKernelGetInfo; - pDdiTable->pfnGetGroupInfo = urKernelGetGroupInfo; - pDdiTable->pfnGetSubGroupInfo = urKernelGetSubGroupInfo; - pDdiTable->pfnRetain = urKernelRetain; - pDdiTable->pfnRelease = urKernelRelease; - pDdiTable->pfnGetNativeHandle = urKernelGetNativeHandle; - pDdiTable->pfnCreateWithNativeHandle = urKernelCreateWithNativeHandle; - pDdiTable->pfnSetArgValue = urKernelSetArgValue; - pDdiTable->pfnSetArgLocal = urKernelSetArgLocal; - pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; - pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; - pDdiTable->pfnSetArgSampler = urKernelSetArgSampler; - pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj; - pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; - pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; - return retVal; + + pDdiTable->pfnKernelLaunch = ur::level_zero::urEnqueueKernelLaunch; + pDdiTable->pfnEventsWait = ur::level_zero::urEnqueueEventsWait; + 
pDdiTable->pfnEventsWaitWithBarrier = + ur::level_zero::urEnqueueEventsWaitWithBarrier; + pDdiTable->pfnMemBufferRead = ur::level_zero::urEnqueueMemBufferRead; + pDdiTable->pfnMemBufferWrite = ur::level_zero::urEnqueueMemBufferWrite; + pDdiTable->pfnMemBufferReadRect = ur::level_zero::urEnqueueMemBufferReadRect; + pDdiTable->pfnMemBufferWriteRect = + ur::level_zero::urEnqueueMemBufferWriteRect; + pDdiTable->pfnMemBufferCopy = ur::level_zero::urEnqueueMemBufferCopy; + pDdiTable->pfnMemBufferCopyRect = ur::level_zero::urEnqueueMemBufferCopyRect; + pDdiTable->pfnMemBufferFill = ur::level_zero::urEnqueueMemBufferFill; + pDdiTable->pfnMemImageRead = ur::level_zero::urEnqueueMemImageRead; + pDdiTable->pfnMemImageWrite = ur::level_zero::urEnqueueMemImageWrite; + pDdiTable->pfnMemImageCopy = ur::level_zero::urEnqueueMemImageCopy; + pDdiTable->pfnMemBufferMap = ur::level_zero::urEnqueueMemBufferMap; + pDdiTable->pfnMemUnmap = ur::level_zero::urEnqueueMemUnmap; + pDdiTable->pfnUSMFill = ur::level_zero::urEnqueueUSMFill; + pDdiTable->pfnUSMMemcpy = ur::level_zero::urEnqueueUSMMemcpy; + pDdiTable->pfnUSMPrefetch = ur::level_zero::urEnqueueUSMPrefetch; + pDdiTable->pfnUSMAdvise = ur::level_zero::urEnqueueUSMAdvise; + pDdiTable->pfnUSMFill2D = ur::level_zero::urEnqueueUSMFill2D; + pDdiTable->pfnUSMMemcpy2D = ur::level_zero::urEnqueueUSMMemcpy2D; + pDdiTable->pfnDeviceGlobalVariableWrite = + ur::level_zero::urEnqueueDeviceGlobalVariableWrite; + pDdiTable->pfnDeviceGlobalVariableRead = + ur::level_zero::urEnqueueDeviceGlobalVariableRead; + pDdiTable->pfnReadHostPipe = ur::level_zero::urEnqueueReadHostPipe; + pDdiTable->pfnWriteHostPipe = ur::level_zero::urEnqueueWriteHostPipe; + + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetMemProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_mem_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { - auto retVal = validateProcInputs(version, pDdiTable); - if 
(UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( + ur_api_version_t version, ur_enqueue_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnImageCreate = urMemImageCreate; - pDdiTable->pfnBufferCreate = urMemBufferCreate; - pDdiTable->pfnRetain = urMemRetain; - pDdiTable->pfnRelease = urMemRelease; - pDdiTable->pfnBufferPartition = urMemBufferPartition; - pDdiTable->pfnGetNativeHandle = urMemGetNativeHandle; - pDdiTable->pfnBufferCreateWithNativeHandle = - urMemBufferCreateWithNativeHandle; - pDdiTable->pfnImageCreateWithNativeHandle = urMemImageCreateWithNativeHandle; - pDdiTable->pfnGetInfo = urMemGetInfo; - pDdiTable->pfnImageGetInfo = urMemImageGetInfo; - return retVal; + pDdiTable->pfnKernelLaunchCustomExp = + ur::level_zero::urEnqueueKernelLaunchCustomExp; + pDdiTable->pfnCooperativeKernelLaunchExp = + ur::level_zero::urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = + ur::level_zero::urEnqueueTimestampRecordingExp; + pDdiTable->pfnNativeCommandExp = ur::level_zero::urEnqueueNativeCommandExp; + + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_platform_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( + ur_api_version_t version, ur_event_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnGet = urPlatformGet; - pDdiTable->pfnGetInfo = urPlatformGetInfo; - pDdiTable->pfnGetNativeHandle = urPlatformGetNativeHandle; - pDdiTable->pfnCreateWithNativeHandle = 
urPlatformCreateWithNativeHandle; - pDdiTable->pfnGetApiVersion = urPlatformGetApiVersion; - pDdiTable->pfnGetBackendOption = urPlatformGetBackendOption; - - return retVal; -} -UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_program_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { + pDdiTable->pfnGetInfo = ur::level_zero::urEventGetInfo; + pDdiTable->pfnGetProfilingInfo = ur::level_zero::urEventGetProfilingInfo; + pDdiTable->pfnWait = ur::level_zero::urEventWait; + pDdiTable->pfnRetain = ur::level_zero::urEventRetain; + pDdiTable->pfnRelease = ur::level_zero::urEventRelease; + pDdiTable->pfnGetNativeHandle = ur::level_zero::urEventGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = + ur::level_zero::urEventCreateWithNativeHandle; + pDdiTable->pfnSetCallback = ur::level_zero::urEventSetCallback; + + return result; +} - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( + ur_api_version_t version, ur_kernel_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnCreateWithIL = urProgramCreateWithIL; - pDdiTable->pfnCreateWithBinary = urProgramCreateWithBinary; - pDdiTable->pfnBuild = urProgramBuild; - pDdiTable->pfnCompile = urProgramCompile; - pDdiTable->pfnLink = urProgramLink; - pDdiTable->pfnRetain = urProgramRetain; - pDdiTable->pfnRelease = urProgramRelease; - pDdiTable->pfnGetFunctionPointer = urProgramGetFunctionPointer; - pDdiTable->pfnGetGlobalVariablePointer = urProgramGetGlobalVariablePointer; - pDdiTable->pfnGetInfo = urProgramGetInfo; - pDdiTable->pfnGetBuildInfo = urProgramGetBuildInfo; + + pDdiTable->pfnCreate = ur::level_zero::urKernelCreate; + pDdiTable->pfnGetInfo = ur::level_zero::urKernelGetInfo; + 
pDdiTable->pfnGetGroupInfo = ur::level_zero::urKernelGetGroupInfo; + pDdiTable->pfnGetSubGroupInfo = ur::level_zero::urKernelGetSubGroupInfo; + pDdiTable->pfnRetain = ur::level_zero::urKernelRetain; + pDdiTable->pfnRelease = ur::level_zero::urKernelRelease; + pDdiTable->pfnGetNativeHandle = ur::level_zero::urKernelGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = + ur::level_zero::urKernelCreateWithNativeHandle; + pDdiTable->pfnGetSuggestedLocalWorkSize = + ur::level_zero::urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnSetArgValue = ur::level_zero::urKernelSetArgValue; + pDdiTable->pfnSetArgLocal = ur::level_zero::urKernelSetArgLocal; + pDdiTable->pfnSetArgPointer = ur::level_zero::urKernelSetArgPointer; + pDdiTable->pfnSetExecInfo = ur::level_zero::urKernelSetExecInfo; + pDdiTable->pfnSetArgSampler = ur::level_zero::urKernelSetArgSampler; + pDdiTable->pfnSetArgMemObj = ur::level_zero::urKernelSetArgMemObj; pDdiTable->pfnSetSpecializationConstants = - urProgramSetSpecializationConstants; - pDdiTable->pfnGetNativeHandle = urProgramGetNativeHandle; - pDdiTable->pfnCreateWithNativeHandle = urProgramCreateWithNativeHandle; + ur::level_zero::urKernelSetSpecializationConstants; - return retVal; + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_queue_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( + ur_api_version_t version, ur_kernel_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnGetInfo = urQueueGetInfo; - pDdiTable->pfnCreate = urQueueCreate; - pDdiTable->pfnRetain = urQueueRetain; - pDdiTable->pfnRelease = urQueueRelease; - 
pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; - pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; - pDdiTable->pfnFinish = urQueueFinish; - pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = + ur::level_zero::urKernelSuggestMaxCooperativeGroupCountExp; - return retVal; + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_sampler_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL +urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnCreate = urSamplerCreate; - pDdiTable->pfnRetain = urSamplerRetain; - pDdiTable->pfnRelease = urSamplerRelease; - pDdiTable->pfnGetInfo = urSamplerGetInfo; - pDdiTable->pfnGetNativeHandle = urSamplerGetNativeHandle; - pDdiTable->pfnCreateWithNativeHandle = urSamplerCreateWithNativeHandle; - - return retVal; + + pDdiTable->pfnImageCreate = ur::level_zero::urMemImageCreate; + pDdiTable->pfnBufferCreate = ur::level_zero::urMemBufferCreate; + pDdiTable->pfnRetain = ur::level_zero::urMemRetain; + pDdiTable->pfnRelease = ur::level_zero::urMemRelease; + pDdiTable->pfnBufferPartition = ur::level_zero::urMemBufferPartition; + pDdiTable->pfnGetNativeHandle = ur::level_zero::urMemGetNativeHandle; + pDdiTable->pfnBufferCreateWithNativeHandle = + ur::level_zero::urMemBufferCreateWithNativeHandle; + pDdiTable->pfnImageCreateWithNativeHandle = + ur::level_zero::urMemImageCreateWithNativeHandle; + pDdiTable->pfnGetInfo = ur::level_zero::urMemGetInfo; + pDdiTable->pfnImageGetInfo = ur::level_zero::urMemImageGetInfo; + + return result; } -UR_DLLEXPORT ur_result_t 
UR_APICALL urGetUSMProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_usm_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable( + ur_api_version_t version, ur_physical_mem_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnHostAlloc = urUSMHostAlloc; - pDdiTable->pfnDeviceAlloc = urUSMDeviceAlloc; - pDdiTable->pfnSharedAlloc = urUSMSharedAlloc; - pDdiTable->pfnFree = urUSMFree; - pDdiTable->pfnGetMemAllocInfo = urUSMGetMemAllocInfo; - pDdiTable->pfnPoolCreate = urUSMPoolCreate; - pDdiTable->pfnPoolRetain = urUSMPoolRetain; - pDdiTable->pfnPoolRelease = urUSMPoolRelease; - pDdiTable->pfnPoolGetInfo = urUSMPoolGetInfo; - - return retVal; + pDdiTable->pfnCreate = ur::level_zero::urPhysicalMemCreate; + pDdiTable->pfnRetain = ur::level_zero::urPhysicalMemRetain; + pDdiTable->pfnRelease = ur::level_zero::urPhysicalMemRelease; + + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_device_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( + ur_api_version_t version, ur_platform_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnGet = urDeviceGet; - pDdiTable->pfnGetInfo = urDeviceGetInfo; - pDdiTable->pfnRetain = urDeviceRetain; - pDdiTable->pfnRelease = urDeviceRelease; - pDdiTable->pfnPartition = urDevicePartition; - 
pDdiTable->pfnSelectBinary = urDeviceSelectBinary; - pDdiTable->pfnGetNativeHandle = urDeviceGetNativeHandle; - pDdiTable->pfnCreateWithNativeHandle = urDeviceCreateWithNativeHandle; - pDdiTable->pfnGetGlobalTimestamps = urDeviceGetGlobalTimestamps; - - return retVal; + + pDdiTable->pfnGet = ur::level_zero::urPlatformGet; + pDdiTable->pfnGetInfo = ur::level_zero::urPlatformGetInfo; + pDdiTable->pfnGetNativeHandle = ur::level_zero::urPlatformGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = + ur::level_zero::urPlatformCreateWithNativeHandle; + pDdiTable->pfnGetApiVersion = ur::level_zero::urPlatformGetApiVersion; + pDdiTable->pfnGetBackendOption = ur::level_zero::urPlatformGetBackendOption; + + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_command_buffer_exp_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( + ur_api_version_t version, ur_program_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnCreateExp = urCommandBufferCreateExp; - pDdiTable->pfnRetainExp = urCommandBufferRetainExp; - pDdiTable->pfnReleaseExp = urCommandBufferReleaseExp; - pDdiTable->pfnFinalizeExp = urCommandBufferFinalizeExp; - pDdiTable->pfnAppendKernelLaunchExp = urCommandBufferAppendKernelLaunchExp; - pDdiTable->pfnAppendUSMMemcpyExp = urCommandBufferAppendUSMMemcpyExp; - pDdiTable->pfnAppendUSMFillExp = urCommandBufferAppendUSMFillExp; - pDdiTable->pfnAppendMemBufferCopyExp = urCommandBufferAppendMemBufferCopyExp; - pDdiTable->pfnAppendMemBufferCopyRectExp = - urCommandBufferAppendMemBufferCopyRectExp; - pDdiTable->pfnAppendMemBufferReadExp = 
urCommandBufferAppendMemBufferReadExp; - pDdiTable->pfnAppendMemBufferReadRectExp = - urCommandBufferAppendMemBufferReadRectExp; - pDdiTable->pfnAppendMemBufferWriteExp = - urCommandBufferAppendMemBufferWriteExp; - pDdiTable->pfnAppendMemBufferWriteRectExp = - urCommandBufferAppendMemBufferWriteRectExp; - pDdiTable->pfnAppendUSMPrefetchExp = urCommandBufferAppendUSMPrefetchExp; - pDdiTable->pfnAppendUSMAdviseExp = urCommandBufferAppendUSMAdviseExp; - pDdiTable->pfnAppendMemBufferFillExp = urCommandBufferAppendMemBufferFillExp; - pDdiTable->pfnEnqueueExp = urCommandBufferEnqueueExp; - pDdiTable->pfnUpdateKernelLaunchExp = urCommandBufferUpdateKernelLaunchExp; - pDdiTable->pfnGetInfoExp = urCommandBufferGetInfoExp; - pDdiTable->pfnCommandGetInfoExp = urCommandBufferCommandGetInfoExp; - pDdiTable->pfnReleaseCommandExp = urCommandBufferReleaseCommandExp; - pDdiTable->pfnRetainCommandExp = urCommandBufferRetainCommandExp; - - return retVal; + + pDdiTable->pfnCreateWithIL = ur::level_zero::urProgramCreateWithIL; + pDdiTable->pfnCreateWithBinary = ur::level_zero::urProgramCreateWithBinary; + pDdiTable->pfnBuild = ur::level_zero::urProgramBuild; + pDdiTable->pfnCompile = ur::level_zero::urProgramCompile; + pDdiTable->pfnLink = ur::level_zero::urProgramLink; + pDdiTable->pfnRetain = ur::level_zero::urProgramRetain; + pDdiTable->pfnRelease = ur::level_zero::urProgramRelease; + pDdiTable->pfnGetFunctionPointer = + ur::level_zero::urProgramGetFunctionPointer; + pDdiTable->pfnGetGlobalVariablePointer = + ur::level_zero::urProgramGetGlobalVariablePointer; + pDdiTable->pfnGetInfo = ur::level_zero::urProgramGetInfo; + pDdiTable->pfnGetBuildInfo = ur::level_zero::urProgramGetBuildInfo; + pDdiTable->pfnSetSpecializationConstants = + ur::level_zero::urProgramSetSpecializationConstants; + pDdiTable->pfnGetNativeHandle = ur::level_zero::urProgramGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = + ur::level_zero::urProgramCreateWithNativeHandle; + + return result; } 
-UR_DLLEXPORT ur_result_t UR_APICALL urGetUsmP2PExpProcAddrTable( - ur_api_version_t version, ur_usm_p2p_exp_dditable_t *pDdiTable) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL urGetProgramExpProcAddrTable( + ur_api_version_t version, ur_program_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnEnablePeerAccessExp = urUsmP2PEnablePeerAccessExp; - pDdiTable->pfnDisablePeerAccessExp = urUsmP2PDisablePeerAccessExp; - pDdiTable->pfnPeerAccessGetInfoExp = urUsmP2PPeerAccessGetInfoExp; - return retVal; + pDdiTable->pfnBuildExp = ur::level_zero::urProgramBuildExp; + pDdiTable->pfnCompileExp = ur::level_zero::urProgramCompileExp; + pDdiTable->pfnLinkExp = ur::level_zero::urProgramLinkExp; + + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( - ur_api_version_t version, ur_bindless_images_exp_dditable_t *pDdiTable) { +UR_APIEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( + ur_api_version_t version, ur_queue_dditable_t *pDdiTable) { auto result = validateProcInputs(version, pDdiTable); if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnUnsampledImageHandleDestroyExp = - urBindlessImagesUnsampledImageHandleDestroyExp; - pDdiTable->pfnSampledImageHandleDestroyExp = - urBindlessImagesSampledImageHandleDestroyExp; - pDdiTable->pfnImageAllocateExp = urBindlessImagesImageAllocateExp; - pDdiTable->pfnImageFreeExp = urBindlessImagesImageFreeExp; - pDdiTable->pfnUnsampledImageCreateExp = - urBindlessImagesUnsampledImageCreateExp; - pDdiTable->pfnSampledImageCreateExp = urBindlessImagesSampledImageCreateExp; - pDdiTable->pfnImageCopyExp = urBindlessImagesImageCopyExp; - pDdiTable->pfnImageGetInfoExp = urBindlessImagesImageGetInfoExp; - pDdiTable->pfnMipmapGetLevelExp = urBindlessImagesMipmapGetLevelExp; - 
pDdiTable->pfnMipmapFreeExp = urBindlessImagesMipmapFreeExp; - pDdiTable->pfnImportExternalMemoryExp = - urBindlessImagesImportExternalMemoryExp; - pDdiTable->pfnMapExternalArrayExp = urBindlessImagesMapExternalArrayExp; - pDdiTable->pfnReleaseInteropExp = urBindlessImagesReleaseInteropExp; - pDdiTable->pfnImportExternalSemaphoreExp = - urBindlessImagesImportExternalSemaphoreExp; - pDdiTable->pfnReleaseExternalSemaphoreExp = - urBindlessImagesReleaseExternalSemaphoreExp; - pDdiTable->pfnWaitExternalSemaphoreExp = - urBindlessImagesWaitExternalSemaphoreExp; - pDdiTable->pfnSignalExternalSemaphoreExp = - urBindlessImagesSignalExternalSemaphoreExp; - return UR_RESULT_SUCCESS; + + pDdiTable->pfnGetInfo = ur::level_zero::urQueueGetInfo; + pDdiTable->pfnCreate = ur::level_zero::urQueueCreate; + pDdiTable->pfnRetain = ur::level_zero::urQueueRetain; + pDdiTable->pfnRelease = ur::level_zero::urQueueRelease; + pDdiTable->pfnGetNativeHandle = ur::level_zero::urQueueGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = + ur::level_zero::urQueueCreateWithNativeHandle; + pDdiTable->pfnFinish = ur::level_zero::urQueueFinish; + pDdiTable->pfnFlush = ur::level_zero::urQueueFlush; + + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( - ur_api_version_t version, ur_usm_exp_dditable_t *pDdiTable) { +UR_APIEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( + ur_api_version_t version, ur_sampler_dditable_t *pDdiTable) { auto result = validateProcInputs(version, pDdiTable); if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnPitchedAllocExp = urUSMPitchedAllocExp; - pDdiTable->pfnImportExp = urUSMImportExp; - pDdiTable->pfnReleaseExp = urUSMReleaseExp; - return UR_RESULT_SUCCESS; + + pDdiTable->pfnCreate = ur::level_zero::urSamplerCreate; + pDdiTable->pfnRetain = ur::level_zero::urSamplerRetain; + pDdiTable->pfnRelease = ur::level_zero::urSamplerRelease; + pDdiTable->pfnGetInfo = ur::level_zero::urSamplerGetInfo; + 
pDdiTable->pfnGetNativeHandle = ur::level_zero::urSamplerGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = + ur::level_zero::urSamplerCreateWithNativeHandle; + + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetVirtualMemProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_virtual_mem_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL urGetTensorMapExpProcAddrTable( + ur_api_version_t version, ur_tensor_map_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnFree = urVirtualMemFree; - pDdiTable->pfnGetInfo = urVirtualMemGetInfo; - pDdiTable->pfnGranularityGetInfo = urVirtualMemGranularityGetInfo; - pDdiTable->pfnMap = urVirtualMemMap; - pDdiTable->pfnReserve = urVirtualMemReserve; - pDdiTable->pfnSetAccess = urVirtualMemSetAccess; - pDdiTable->pfnUnmap = urVirtualMemUnmap; + pDdiTable->pfnEncodeIm2ColExp = ur::level_zero::urTensorMapEncodeIm2ColExp; + pDdiTable->pfnEncodeTiledExp = ur::level_zero::urTensorMapEncodeTiledExp; - return retVal; + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_physical_mem_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; +UR_APIEXPORT ur_result_t UR_APICALL +urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; } - pDdiTable->pfnCreate = urPhysicalMemCreate; - pDdiTable->pfnRelease = urPhysicalMemRelease; - pDdiTable->pfnRetain = 
urPhysicalMemRetain; - - return retVal; + pDdiTable->pfnHostAlloc = ur::level_zero::urUSMHostAlloc; + pDdiTable->pfnDeviceAlloc = ur::level_zero::urUSMDeviceAlloc; + pDdiTable->pfnSharedAlloc = ur::level_zero::urUSMSharedAlloc; + pDdiTable->pfnFree = ur::level_zero::urUSMFree; + pDdiTable->pfnGetMemAllocInfo = ur::level_zero::urUSMGetMemAllocInfo; + pDdiTable->pfnPoolCreate = ur::level_zero::urUSMPoolCreate; + pDdiTable->pfnPoolRetain = ur::level_zero::urUSMPoolRetain; + pDdiTable->pfnPoolRelease = ur::level_zero::urUSMPoolRelease; + pDdiTable->pfnPoolGetInfo = ur::level_zero::urUSMPoolGetInfo; + + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( - ur_api_version_t version, ur_enqueue_exp_dditable_t *pDdiTable) { +UR_APIEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( + ur_api_version_t version, ur_usm_exp_dditable_t *pDdiTable) { auto result = validateProcInputs(version, pDdiTable); if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCooperativeKernelLaunchExp = - urEnqueueCooperativeKernelLaunchExp; - pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; - pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; + pDdiTable->pfnPitchedAllocExp = ur::level_zero::urUSMPitchedAllocExp; + pDdiTable->pfnImportExp = ur::level_zero::urUSMImportExp; + pDdiTable->pfnReleaseExp = ur::level_zero::urUSMReleaseExp; - return UR_RESULT_SUCCESS; + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( - ur_api_version_t version, ur_kernel_exp_dditable_t *pDdiTable) { +UR_APIEXPORT ur_result_t UR_APICALL urGetUsmP2PExpProcAddrTable( + ur_api_version_t version, ur_usm_p2p_exp_dditable_t *pDdiTable) { auto result = validateProcInputs(version, pDdiTable); if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = - urKernelSuggestMaxCooperativeGroupCountExp; + pDdiTable->pfnEnablePeerAccessExp = + 
ur::level_zero::urUsmP2PEnablePeerAccessExp; + pDdiTable->pfnDisablePeerAccessExp = + ur::level_zero::urUsmP2PDisablePeerAccessExp; + pDdiTable->pfnPeerAccessGetInfoExp = + ur::level_zero::urUsmP2PPeerAccessGetInfoExp; - return UR_RESULT_SUCCESS; + return result; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramExpProcAddrTable( - ur_api_version_t version, ur_program_exp_dditable_t *pDdiTable) { +UR_APIEXPORT ur_result_t UR_APICALL urGetVirtualMemProcAddrTable( + ur_api_version_t version, ur_virtual_mem_dditable_t *pDdiTable) { auto result = validateProcInputs(version, pDdiTable); if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnBuildExp = urProgramBuildExp; - pDdiTable->pfnCompileExp = urProgramCompileExp; - pDdiTable->pfnLinkExp = urProgramLinkExp; + pDdiTable->pfnGranularityGetInfo = + ur::level_zero::urVirtualMemGranularityGetInfo; + pDdiTable->pfnReserve = ur::level_zero::urVirtualMemReserve; + pDdiTable->pfnFree = ur::level_zero::urVirtualMemFree; + pDdiTable->pfnMap = ur::level_zero::urVirtualMemMap; + pDdiTable->pfnUnmap = ur::level_zero::urVirtualMemUnmap; + pDdiTable->pfnSetAccess = ur::level_zero::urVirtualMemSetAccess; + pDdiTable->pfnGetInfo = ur::level_zero::urVirtualMemGetInfo; - return UR_RESULT_SUCCESS; + return result; } -#if defined(__cplusplus) + +UR_APIEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( + ur_api_version_t version, ur_device_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + + pDdiTable->pfnGet = ur::level_zero::urDeviceGet; + pDdiTable->pfnGetInfo = ur::level_zero::urDeviceGetInfo; + pDdiTable->pfnRetain = ur::level_zero::urDeviceRetain; + pDdiTable->pfnRelease = ur::level_zero::urDeviceRelease; + pDdiTable->pfnPartition = ur::level_zero::urDevicePartition; + pDdiTable->pfnSelectBinary = ur::level_zero::urDeviceSelectBinary; + pDdiTable->pfnGetNativeHandle = ur::level_zero::urDeviceGetNativeHandle; + 
pDdiTable->pfnCreateWithNativeHandle = + ur::level_zero::urDeviceCreateWithNativeHandle; + pDdiTable->pfnGetGlobalTimestamps = + ur::level_zero::urDeviceGetGlobalTimestamps; + + return result; +} + +#ifdef UR_STATIC_ADAPTER_LEVEL_ZERO +} // namespace ur::level_zero +#elif defined(__cplusplus) } // extern "C" #endif + +#ifdef UR_STATIC_ADAPTER_LEVEL_ZERO +namespace ur::level_zero { +ur_result_t urAdapterGetDdiTables(ur_dditable_t *ddi) { + if (ddi == nullptr) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + ur_result_t result; + + result = ur::level_zero::urGetGlobalProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->Global); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetBindlessImagesExpProcAddrTable( + UR_API_VERSION_CURRENT, &ddi->BindlessImagesExp); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetCommandBufferExpProcAddrTable( + UR_API_VERSION_CURRENT, &ddi->CommandBufferExp); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetContextProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->Context); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetEnqueueProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->Enqueue); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetEnqueueExpProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->EnqueueExp); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetEventProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->Event); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetKernelProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->Kernel); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetKernelExpProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->KernelExp); + if (result != UR_RESULT_SUCCESS) + return result; + result = + ur::level_zero::urGetMemProcAddrTable(UR_API_VERSION_CURRENT, &ddi->Mem); + if 
(result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetPhysicalMemProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->PhysicalMem); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetPlatformProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->Platform); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetProgramProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->Program); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetProgramExpProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->ProgramExp); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetQueueProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->Queue); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetSamplerProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->Sampler); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetTensorMapExpProcAddrTable( + UR_API_VERSION_CURRENT, &ddi->TensorMapExp); + if (result != UR_RESULT_SUCCESS) + return result; + result = + ur::level_zero::urGetUSMProcAddrTable(UR_API_VERSION_CURRENT, &ddi->USM); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetUSMExpProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->USMExp); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetUsmP2PExpProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->UsmP2PExp); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetVirtualMemProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->VirtualMem); + if (result != UR_RESULT_SUCCESS) + return result; + result = ur::level_zero::urGetDeviceProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->Device); + if (result != UR_RESULT_SUCCESS) + return result; + + return result; +} +} // namespace ur::level_zero +#endif diff --git a/source/adapters/level_zero/ur_interface_loader.hpp 
b/source/adapters/level_zero/ur_interface_loader.hpp new file mode 100644 index 0000000000..e607fbad0c --- /dev/null +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -0,0 +1,760 @@ +//===--------- ur_interface_loader.hpp - Level Zero Adapter ------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include + +namespace ur::level_zero { +ur_result_t urAdapterGet(uint32_t NumEntries, ur_adapter_handle_t *phAdapters, + uint32_t *pNumAdapters); +ur_result_t urAdapterRelease(ur_adapter_handle_t hAdapter); +ur_result_t urAdapterRetain(ur_adapter_handle_t hAdapter); +ur_result_t urAdapterGetLastError(ur_adapter_handle_t hAdapter, + const char **ppMessage, int32_t *pError); +ur_result_t urAdapterGetInfo(ur_adapter_handle_t hAdapter, + ur_adapter_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet); +ur_result_t urPlatformGet(ur_adapter_handle_t *phAdapters, uint32_t NumAdapters, + uint32_t NumEntries, + ur_platform_handle_t *phPlatforms, + uint32_t *pNumPlatforms); +ur_result_t urPlatformGetInfo(ur_platform_handle_t hPlatform, + ur_platform_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet); +ur_result_t urPlatformGetApiVersion(ur_platform_handle_t hPlatform, + ur_api_version_t *pVersion); +ur_result_t urPlatformGetNativeHandle(ur_platform_handle_t hPlatform, + ur_native_handle_t *phNativePlatform); +ur_result_t urPlatformCreateWithNativeHandle( + ur_native_handle_t hNativePlatform, ur_adapter_handle_t hAdapter, + const ur_platform_native_properties_t *pProperties, + ur_platform_handle_t *phPlatform); +ur_result_t urPlatformGetBackendOption(ur_platform_handle_t hPlatform, + const char *pFrontendOption, + const char **ppPlatformOption); 
+ur_result_t urDeviceGet(ur_platform_handle_t hPlatform, + ur_device_type_t DeviceType, uint32_t NumEntries, + ur_device_handle_t *phDevices, uint32_t *pNumDevices); +ur_result_t urDeviceGetInfo(ur_device_handle_t hDevice, + ur_device_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet); +ur_result_t urDeviceRetain(ur_device_handle_t hDevice); +ur_result_t urDeviceRelease(ur_device_handle_t hDevice); +ur_result_t +urDevicePartition(ur_device_handle_t hDevice, + const ur_device_partition_properties_t *pProperties, + uint32_t NumDevices, ur_device_handle_t *phSubDevices, + uint32_t *pNumDevicesRet); +ur_result_t urDeviceSelectBinary(ur_device_handle_t hDevice, + const ur_device_binary_t *pBinaries, + uint32_t NumBinaries, + uint32_t *pSelectedBinary); +ur_result_t urDeviceGetNativeHandle(ur_device_handle_t hDevice, + ur_native_handle_t *phNativeDevice); +ur_result_t +urDeviceCreateWithNativeHandle(ur_native_handle_t hNativeDevice, + ur_adapter_handle_t hAdapter, + const ur_device_native_properties_t *pProperties, + ur_device_handle_t *phDevice); +ur_result_t urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, + uint64_t *pDeviceTimestamp, + uint64_t *pHostTimestamp); +ur_result_t urContextCreate(uint32_t DeviceCount, + const ur_device_handle_t *phDevices, + const ur_context_properties_t *pProperties, + ur_context_handle_t *phContext); +ur_result_t urContextRetain(ur_context_handle_t hContext); +ur_result_t urContextRelease(ur_context_handle_t hContext); +ur_result_t urContextGetInfo(ur_context_handle_t hContext, + ur_context_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet); +ur_result_t urContextGetNativeHandle(ur_context_handle_t hContext, + ur_native_handle_t *phNativeContext); +ur_result_t urContextCreateWithNativeHandle( + ur_native_handle_t hNativeContext, ur_adapter_handle_t hAdapter, + uint32_t numDevices, const ur_device_handle_t *phDevices, + const ur_context_native_properties_t *pProperties, + 
ur_context_handle_t *phContext); +ur_result_t +urContextSetExtendedDeleter(ur_context_handle_t hContext, + ur_context_extended_deleter_t pfnDeleter, + void *pUserData); +ur_result_t urMemImageCreate(ur_context_handle_t hContext, ur_mem_flags_t flags, + const ur_image_format_t *pImageFormat, + const ur_image_desc_t *pImageDesc, void *pHost, + ur_mem_handle_t *phMem); +ur_result_t urMemBufferCreate(ur_context_handle_t hContext, + ur_mem_flags_t flags, size_t size, + const ur_buffer_properties_t *pProperties, + ur_mem_handle_t *phBuffer); +ur_result_t urMemRetain(ur_mem_handle_t hMem); +ur_result_t urMemRelease(ur_mem_handle_t hMem); +ur_result_t urMemBufferPartition(ur_mem_handle_t hBuffer, ur_mem_flags_t flags, + ur_buffer_create_type_t bufferCreateType, + const ur_buffer_region_t *pRegion, + ur_mem_handle_t *phMem); +ur_result_t urMemGetNativeHandle(ur_mem_handle_t hMem, + ur_device_handle_t hDevice, + ur_native_handle_t *phNativeMem); +ur_result_t urMemBufferCreateWithNativeHandle( + ur_native_handle_t hNativeMem, ur_context_handle_t hContext, + const ur_mem_native_properties_t *pProperties, ur_mem_handle_t *phMem); +ur_result_t urMemImageCreateWithNativeHandle( + ur_native_handle_t hNativeMem, ur_context_handle_t hContext, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + const ur_mem_native_properties_t *pProperties, ur_mem_handle_t *phMem); +ur_result_t urMemGetInfo(ur_mem_handle_t hMemory, ur_mem_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet); +ur_result_t urMemImageGetInfo(ur_mem_handle_t hMemory, ur_image_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet); +ur_result_t urSamplerCreate(ur_context_handle_t hContext, + const ur_sampler_desc_t *pDesc, + ur_sampler_handle_t *phSampler); +ur_result_t urSamplerRetain(ur_sampler_handle_t hSampler); +ur_result_t urSamplerRelease(ur_sampler_handle_t hSampler); +ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler, + 
ur_sampler_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet); +ur_result_t urSamplerGetNativeHandle(ur_sampler_handle_t hSampler, + ur_native_handle_t *phNativeSampler); +ur_result_t urSamplerCreateWithNativeHandle( + ur_native_handle_t hNativeSampler, ur_context_handle_t hContext, + const ur_sampler_native_properties_t *pProperties, + ur_sampler_handle_t *phSampler); +ur_result_t urUSMHostAlloc(ur_context_handle_t hContext, + const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t pool, size_t size, + void **ppMem); +ur_result_t urUSMDeviceAlloc(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t pool, size_t size, + void **ppMem); +ur_result_t urUSMSharedAlloc(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t pool, size_t size, + void **ppMem); +ur_result_t urUSMFree(ur_context_handle_t hContext, void *pMem); +ur_result_t urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, + ur_usm_alloc_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet); +ur_result_t urUSMPoolCreate(ur_context_handle_t hContext, + ur_usm_pool_desc_t *pPoolDesc, + ur_usm_pool_handle_t *ppPool); +ur_result_t urUSMPoolRetain(ur_usm_pool_handle_t pPool); +ur_result_t urUSMPoolRelease(ur_usm_pool_handle_t pPool); +ur_result_t urUSMPoolGetInfo(ur_usm_pool_handle_t hPool, + ur_usm_pool_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet); +ur_result_t urVirtualMemGranularityGetInfo( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_virtual_mem_granularity_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet); +ur_result_t urVirtualMemReserve(ur_context_handle_t hContext, + const void *pStart, size_t size, + void **ppStart); +ur_result_t urVirtualMemFree(ur_context_handle_t hContext, const void *pStart, + size_t size); +ur_result_t 
urVirtualMemMap(ur_context_handle_t hContext, const void *pStart, + size_t size, ur_physical_mem_handle_t hPhysicalMem, + size_t offset, ur_virtual_mem_access_flags_t flags); +ur_result_t urVirtualMemUnmap(ur_context_handle_t hContext, const void *pStart, + size_t size); +ur_result_t urVirtualMemSetAccess(ur_context_handle_t hContext, + const void *pStart, size_t size, + ur_virtual_mem_access_flags_t flags); +ur_result_t urVirtualMemGetInfo(ur_context_handle_t hContext, + const void *pStart, size_t size, + ur_virtual_mem_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet); +ur_result_t urPhysicalMemCreate(ur_context_handle_t hContext, + ur_device_handle_t hDevice, size_t size, + const ur_physical_mem_properties_t *pProperties, + ur_physical_mem_handle_t *phPhysicalMem); +ur_result_t urPhysicalMemRetain(ur_physical_mem_handle_t hPhysicalMem); +ur_result_t urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem); +ur_result_t urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, + size_t length, + const ur_program_properties_t *pProperties, + ur_program_handle_t *phProgram); +ur_result_t urProgramCreateWithBinary( + ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, + const uint8_t *pBinary, const ur_program_properties_t *pProperties, + ur_program_handle_t *phProgram); +ur_result_t urProgramBuild(ur_context_handle_t hContext, + ur_program_handle_t hProgram, const char *pOptions); +ur_result_t urProgramCompile(ur_context_handle_t hContext, + ur_program_handle_t hProgram, + const char *pOptions); +ur_result_t urProgramLink(ur_context_handle_t hContext, uint32_t count, + const ur_program_handle_t *phPrograms, + const char *pOptions, ur_program_handle_t *phProgram); +ur_result_t urProgramRetain(ur_program_handle_t hProgram); +ur_result_t urProgramRelease(ur_program_handle_t hProgram); +ur_result_t urProgramGetFunctionPointer(ur_device_handle_t hDevice, + ur_program_handle_t hProgram, + const char 
*pFunctionName, + void **ppFunctionPointer); +ur_result_t urProgramGetGlobalVariablePointer( + ur_device_handle_t hDevice, ur_program_handle_t hProgram, + const char *pGlobalVariableName, size_t *pGlobalVariableSizeRet, + void **ppGlobalVariablePointerRet); +ur_result_t urProgramGetInfo(ur_program_handle_t hProgram, + ur_program_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet); +ur_result_t urProgramGetBuildInfo(ur_program_handle_t hProgram, + ur_device_handle_t hDevice, + ur_program_build_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet); +ur_result_t urProgramSetSpecializationConstants( + ur_program_handle_t hProgram, uint32_t count, + const ur_specialization_constant_info_t *pSpecConstants); +ur_result_t urProgramGetNativeHandle(ur_program_handle_t hProgram, + ur_native_handle_t *phNativeProgram); +ur_result_t urProgramCreateWithNativeHandle( + ur_native_handle_t hNativeProgram, ur_context_handle_t hContext, + const ur_program_native_properties_t *pProperties, + ur_program_handle_t *phProgram); +ur_result_t urKernelCreate(ur_program_handle_t hProgram, + const char *pKernelName, + ur_kernel_handle_t *phKernel); +ur_result_t urKernelSetArgValue( + ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, + const ur_kernel_arg_value_properties_t *pProperties, const void *pArgValue); +ur_result_t +urKernelSetArgLocal(ur_kernel_handle_t hKernel, uint32_t argIndex, + size_t argSize, + const ur_kernel_arg_local_properties_t *pProperties); +ur_result_t urKernelGetInfo(ur_kernel_handle_t hKernel, + ur_kernel_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet); +ur_result_t urKernelGetGroupInfo(ur_kernel_handle_t hKernel, + ur_device_handle_t hDevice, + ur_kernel_group_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet); +ur_result_t urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, + ur_device_handle_t hDevice, + ur_kernel_sub_group_info_t propName, + size_t 
propSize, void *pPropValue, + size_t *pPropSizeRet); +ur_result_t urKernelRetain(ur_kernel_handle_t hKernel); +ur_result_t urKernelRelease(ur_kernel_handle_t hKernel); +ur_result_t +urKernelSetArgPointer(ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_pointer_properties_t *pProperties, + const void *pArgValue); +ur_result_t +urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, + size_t propSize, + const ur_kernel_exec_info_properties_t *pProperties, + const void *pPropValue); +ur_result_t +urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_sampler_properties_t *pProperties, + ur_sampler_handle_t hArgValue); +ur_result_t +urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_mem_obj_properties_t *pProperties, + ur_mem_handle_t hArgValue); +ur_result_t urKernelSetSpecializationConstants( + ur_kernel_handle_t hKernel, uint32_t count, + const ur_specialization_constant_info_t *pSpecConstants); +ur_result_t urKernelGetNativeHandle(ur_kernel_handle_t hKernel, + ur_native_handle_t *phNativeKernel); +ur_result_t +urKernelCreateWithNativeHandle(ur_native_handle_t hNativeKernel, + ur_context_handle_t hContext, + ur_program_handle_t hProgram, + const ur_kernel_native_properties_t *pProperties, + ur_kernel_handle_t *phKernel); +ur_result_t urKernelGetSuggestedLocalWorkSize(ur_kernel_handle_t hKernel, + ur_queue_handle_t hQueue, + uint32_t numWorkDim, + const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, + size_t *pSuggestedLocalWorkSize); +ur_result_t urQueueGetInfo(ur_queue_handle_t hQueue, ur_queue_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet); +ur_result_t urQueueCreate(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + const ur_queue_properties_t *pProperties, + ur_queue_handle_t *phQueue); +ur_result_t urQueueRetain(ur_queue_handle_t hQueue); +ur_result_t urQueueRelease(ur_queue_handle_t 
hQueue); +ur_result_t urQueueGetNativeHandle(ur_queue_handle_t hQueue, + ur_queue_native_desc_t *pDesc, + ur_native_handle_t *phNativeQueue); +ur_result_t urQueueCreateWithNativeHandle( + ur_native_handle_t hNativeQueue, ur_context_handle_t hContext, + ur_device_handle_t hDevice, const ur_queue_native_properties_t *pProperties, + ur_queue_handle_t *phQueue); +ur_result_t urQueueFinish(ur_queue_handle_t hQueue); +ur_result_t urQueueFlush(ur_queue_handle_t hQueue); +ur_result_t urEventGetInfo(ur_event_handle_t hEvent, ur_event_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet); +ur_result_t urEventGetProfilingInfo(ur_event_handle_t hEvent, + ur_profiling_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet); +ur_result_t urEventWait(uint32_t numEvents, + const ur_event_handle_t *phEventWaitList); +ur_result_t urEventRetain(ur_event_handle_t hEvent); +ur_result_t urEventRelease(ur_event_handle_t hEvent); +ur_result_t urEventGetNativeHandle(ur_event_handle_t hEvent, + ur_native_handle_t *phNativeEvent); +ur_result_t +urEventCreateWithNativeHandle(ur_native_handle_t hNativeEvent, + ur_context_handle_t hContext, + const ur_event_native_properties_t *pProperties, + ur_event_handle_t *phEvent); +ur_result_t urEventSetCallback(ur_event_handle_t hEvent, + ur_execution_info_t execStatus, + ur_event_callback_t pfnNotify, void *pUserData); +ur_result_t urEnqueueKernelLaunch( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); +ur_result_t urEnqueueEventsWait(ur_queue_handle_t hQueue, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueEventsWaitWithBarrier( + ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, + const 
ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); +ur_result_t urEnqueueMemBufferRead(ur_queue_handle_t hQueue, + ur_mem_handle_t hBuffer, bool blockingRead, + size_t offset, size_t size, void *pDst, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueMemBufferWrite( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, + size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); +ur_result_t urEnqueueMemBufferReadRect( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, + ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pDst, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueMemBufferWriteRect( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, + ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueMemBufferCopy(ur_queue_handle_t hQueue, + ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, size_t srcOffset, + size_t dstOffset, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueMemBufferCopyRect( + ur_queue_handle_t hQueue, ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, + size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, + uint32_t 
numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueMemBufferFill(ur_queue_handle_t hQueue, + ur_mem_handle_t hBuffer, + const void *pPattern, size_t patternSize, + size_t offset, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueMemImageRead( + ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingRead, + ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); +ur_result_t urEnqueueMemImageWrite( + ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingWrite, + ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); +ur_result_t +urEnqueueMemImageCopy(ur_queue_handle_t hQueue, ur_mem_handle_t hImageSrc, + ur_mem_handle_t hImageDst, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueMemBufferMap(ur_queue_handle_t hQueue, + ur_mem_handle_t hBuffer, bool blockingMap, + ur_map_flags_t mapFlags, size_t offset, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, void **ppRetMap); +ur_result_t urEnqueueMemUnmap(ur_queue_handle_t hQueue, ur_mem_handle_t hMem, + void *pMappedPtr, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueUSMFill(ur_queue_handle_t hQueue, void *pMem, + size_t patternSize, const void *pPattern, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + 
ur_event_handle_t *phEvent); +ur_result_t urEnqueueUSMMemcpy(ur_queue_handle_t hQueue, bool blocking, + void *pDst, const void *pSrc, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueUSMPrefetch(ur_queue_handle_t hQueue, const void *pMem, + size_t size, ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, + size_t size, ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueUSMFill2D(ur_queue_handle_t hQueue, void *pMem, + size_t pitch, size_t patternSize, + const void *pPattern, size_t width, + size_t height, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueUSMMemcpy2D(ur_queue_handle_t hQueue, bool blocking, + void *pDst, size_t dstPitch, const void *pSrc, + size_t srcPitch, size_t width, size_t height, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueDeviceGlobalVariableWrite( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, + bool blockingWrite, size_t count, size_t offset, const void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueDeviceGlobalVariableRead( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, + bool blockingRead, size_t count, size_t offset, void *pDst, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueReadHostPipe(ur_queue_handle_t hQueue, + ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pDst, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t 
*phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urEnqueueWriteHostPipe(ur_queue_handle_t hQueue, + ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pSrc, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urUSMPitchedAllocExp(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t pool, size_t widthInBytes, + size_t height, size_t elementSizeBytes, + void **ppMem, size_t *pResultPitch); +ur_result_t urBindlessImagesUnsampledImageHandleDestroyExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_image_native_handle_t hImage); +ur_result_t urBindlessImagesSampledImageHandleDestroyExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_image_native_handle_t hImage); +ur_result_t urBindlessImagesImageAllocateExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + ur_exp_image_mem_native_handle_t *phImageMem); +ur_result_t +urBindlessImagesImageFreeExp(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + ur_exp_image_mem_native_handle_t hImageMem); +ur_result_t urBindlessImagesUnsampledImageCreateExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_image_mem_native_handle_t hImageMem, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + ur_exp_image_native_handle_t *phImage); +ur_result_t urBindlessImagesSampledImageCreateExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_image_mem_native_handle_t hImageMem, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + ur_sampler_handle_t hSampler, ur_exp_image_native_handle_t *phImage); +ur_result_t urBindlessImagesImageCopyExp( + ur_queue_handle_t hQueue, const void *pSrc, void *pDst, + const ur_image_desc_t 
*pSrcImageDesc, const ur_image_desc_t *pDstImageDesc, + const ur_image_format_t *pSrcImageFormat, + const ur_image_format_t *pDstImageFormat, + ur_exp_image_copy_region_t *pCopyRegion, + ur_exp_image_copy_flags_t imageCopyFlags, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); +ur_result_t urBindlessImagesImageGetInfoExp( + ur_context_handle_t hContext, ur_exp_image_mem_native_handle_t hImageMem, + ur_image_info_t propName, void *pPropValue, size_t *pPropSizeRet); +ur_result_t urBindlessImagesMipmapGetLevelExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_image_mem_native_handle_t hImageMem, uint32_t mipmapLevel, + ur_exp_image_mem_native_handle_t *phImageMem); +ur_result_t +urBindlessImagesMipmapFreeExp(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + ur_exp_image_mem_native_handle_t hMem); +ur_result_t urBindlessImagesImportExternalMemoryExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, + ur_exp_external_mem_type_t memHandleType, + ur_exp_external_mem_desc_t *pExternalMemDesc, + ur_exp_external_mem_handle_t *phExternalMem); +ur_result_t urBindlessImagesMapExternalArrayExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + ur_exp_external_mem_handle_t hExternalMem, + ur_exp_image_mem_native_handle_t *phImageMem); +ur_result_t urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint64_t offset, + uint64_t size, ur_exp_external_mem_handle_t hExternalMem, void **ppRetMem); +ur_result_t urBindlessImagesReleaseExternalMemoryExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_external_mem_handle_t hExternalMem); +ur_result_t urBindlessImagesImportExternalSemaphoreExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_external_semaphore_type_t semHandleType, + 
ur_exp_external_semaphore_desc_t *pExternalSemaphoreDesc, + ur_exp_external_semaphore_handle_t *phExternalSemaphore); +ur_result_t urBindlessImagesReleaseExternalSemaphoreExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_external_semaphore_handle_t hExternalSemaphore); +ur_result_t urBindlessImagesWaitExternalSemaphoreExp( + ur_queue_handle_t hQueue, ur_exp_external_semaphore_handle_t hSemaphore, + bool hasWaitValue, uint64_t waitValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); +ur_result_t urBindlessImagesSignalExternalSemaphoreExp( + ur_queue_handle_t hQueue, ur_exp_external_semaphore_handle_t hSemaphore, + bool hasSignalValue, uint64_t signalValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); +ur_result_t +urCommandBufferCreateExp(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + const ur_exp_command_buffer_desc_t *pCommandBufferDesc, + ur_exp_command_buffer_handle_t *phCommandBuffer); +ur_result_t +urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer); +ur_result_t +urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer); +ur_result_t +urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer); +ur_result_t urCommandBufferAppendKernelLaunchExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, + uint32_t workDim, const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numKernelAlternatives, ur_kernel_handle_t *phKernelAlternatives, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand); +ur_result_t urCommandBufferAppendUSMMemcpyExp( + 
ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc, + size_t size, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand); +ur_result_t urCommandBufferAppendUSMFillExp( + ur_exp_command_buffer_handle_t hCommandBuffer, void *pMemory, + const void *pPattern, size_t patternSize, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand); +ur_result_t urCommandBufferAppendMemBufferCopyExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, + ur_mem_handle_t hDstMem, size_t srcOffset, size_t dstOffset, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand); +ur_result_t urCommandBufferAppendMemBufferWriteExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + size_t offset, size_t size, const void *pSrc, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand); +ur_result_t urCommandBufferAppendMemBufferReadExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + size_t 
offset, size_t size, void *pDst, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand); +ur_result_t urCommandBufferAppendMemBufferCopyRectExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, + ur_mem_handle_t hDstMem, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, + size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand); +ur_result_t urCommandBufferAppendMemBufferWriteRectExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand); +ur_result_t urCommandBufferAppendMemBufferReadRectExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pDst, + uint32_t numSyncPointsInWaitList, + const 
ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand); +ur_result_t urCommandBufferAppendMemBufferFillExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + const void *pPattern, size_t patternSize, size_t offset, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand); +ur_result_t urCommandBufferAppendUSMPrefetchExp( + ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, + size_t size, ur_usm_migration_flags_t flags, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand); +ur_result_t urCommandBufferAppendUSMAdviseExp( + ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, + size_t size, ur_usm_advice_flags_t advice, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand); +ur_result_t urCommandBufferEnqueueExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urCommandBufferRetainCommandExp( + 
ur_exp_command_buffer_command_handle_t hCommand); +ur_result_t urCommandBufferReleaseCommandExp( + ur_exp_command_buffer_command_handle_t hCommand); +ur_result_t urCommandBufferUpdateKernelLaunchExp( + ur_exp_command_buffer_command_handle_t hCommand, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *pUpdateKernelLaunch); +ur_result_t urCommandBufferUpdateSignalEventExp( + ur_exp_command_buffer_command_handle_t hCommand, + ur_event_handle_t *phSignalEvent); +ur_result_t urCommandBufferUpdateWaitEventsExp( + ur_exp_command_buffer_command_handle_t hCommand, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList); +ur_result_t +urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, + ur_exp_command_buffer_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet); +ur_result_t urCommandBufferCommandGetInfoExp( + ur_exp_command_buffer_command_handle_t hCommand, + ur_exp_command_buffer_command_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet); +ur_result_t urEnqueueCooperativeKernelLaunchExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); +ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( + ur_kernel_handle_t hKernel, size_t localWorkSize, + size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet); +ur_result_t urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); +ur_result_t urEnqueueKernelLaunchCustomExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numPropsInLaunchPropList, + const ur_exp_launch_property_t *launchPropList, 
+ uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urProgramBuildExp(ur_program_handle_t hProgram, uint32_t numDevices, + ur_device_handle_t *phDevices, + const char *pOptions); +ur_result_t urProgramCompileExp(ur_program_handle_t hProgram, + uint32_t numDevices, + ur_device_handle_t *phDevices, + const char *pOptions); +ur_result_t urProgramLinkExp(ur_context_handle_t hContext, uint32_t numDevices, + ur_device_handle_t *phDevices, uint32_t count, + const ur_program_handle_t *phPrograms, + const char *pOptions, + ur_program_handle_t *phProgram); +ur_result_t urUSMImportExp(ur_context_handle_t hContext, void *pMem, + size_t size); +ur_result_t urUSMReleaseExp(ur_context_handle_t hContext, void *pMem); +ur_result_t urUsmP2PEnablePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice); +ur_result_t urUsmP2PDisablePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice); +ur_result_t urUsmP2PPeerAccessGetInfoExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice, + ur_exp_peer_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet); +ur_result_t urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, + ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, + uint32_t numMemsInMemList, const ur_mem_handle_t *phMemList, + const ur_exp_enqueue_native_command_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); +ur_result_t urTensorMapEncodeIm2ColExp( + ur_device_handle_t hDevice, + ur_exp_tensor_map_data_type_flags_t TensorMapType, uint32_t TensorRank, + void *GlobalAddress, const uint64_t *GlobalDim, + const uint64_t *GlobalStrides, const int *PixelBoxLowerCorner, + const int *PixelBoxUpperCorner, uint32_t ChannelsPerPixel, + uint32_t PixelsPerColumn, const uint32_t *ElementStrides, + ur_exp_tensor_map_interleave_flags_t 
Interleave, + ur_exp_tensor_map_swizzle_flags_t Swizzle, + ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, + ur_exp_tensor_map_oob_fill_flags_t OobFill, + ur_exp_tensor_map_handle_t *hTensorMap); +ur_result_t +urTensorMapEncodeTiledExp(ur_device_handle_t hDevice, + ur_exp_tensor_map_data_type_flags_t TensorMapType, + uint32_t TensorRank, void *GlobalAddress, + const uint64_t *GlobalDim, + const uint64_t *GlobalStrides, const uint32_t *BoxDim, + const uint32_t *ElementStrides, + ur_exp_tensor_map_interleave_flags_t Interleave, + ur_exp_tensor_map_swizzle_flags_t Swizzle, + ur_exp_tensor_map_l2_promotion_flags_t L2Promotion, + ur_exp_tensor_map_oob_fill_flags_t OobFill, + ur_exp_tensor_map_handle_t *hTensorMap); +#ifdef UR_STATIC_ADAPTER_LEVEL_ZERO +ur_result_t urAdapterGetDdiTables(ur_dditable_t *ddi); +#endif +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/ur_level_zero.hpp b/source/adapters/level_zero/ur_level_zero.hpp index 096ae076f9..36965c5d58 100644 --- a/source/adapters/level_zero/ur_level_zero.hpp +++ b/source/adapters/level_zero/ur_level_zero.hpp @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index 2f05bfaa57..28bdf233e8 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -1,6 +1,6 @@ //===--------- usm.cpp - Level Zero Adapter -------------------------------===// // -// Copyright (C) 2023 Intel Corporation +// Copyright (C) 2023-2024 Intel Corporation // // Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM // Exceptions. 
See LICENSE.TXT @@ -17,10 +17,23 @@ #include "usm.hpp" #include "logger/ur_logger.hpp" +#include "ur_interface_loader.hpp" #include "ur_level_zero.hpp" +#include "ur_util.hpp" #include +namespace umf { +ur_result_t getProviderNativeError(const char *providerName, + int32_t nativeError) { + if (strcmp(providerName, "Level Zero") == 0) { + return ze2urResult(static_cast<ze_result_t>(nativeError)); + } + + return UR_RESULT_ERROR_UNKNOWN; +} +} // namespace umf + usm::DisjointPoolAllConfigs DisjointPoolConfigInstance = InitializeDisjointPoolConfig(); @@ -152,15 +165,26 @@ static ur_result_t USMAllocationMakeResident( } else { Devices.push_back(Device); if (ForceResidency == USMAllocationForceResidencyType::P2PDevices) { - ze_bool_t P2P; - for (const auto &D : Context->Devices) { - if (D == Device) - continue; - // TODO: Cache P2P devices for a context - ZE2UR_CALL(zeDeviceCanAccessPeer, - (D->ZeDevice, Device->ZeDevice, &P2P)); - if (P2P) - Devices.push_back(D); + // Check if the P2P devices are already cached + auto it = Context->P2PDeviceCache.find(Device); + if (it != Context->P2PDeviceCache.end()) { + // Use cached P2P devices + Devices.insert(Devices.end(), it->second.begin(), it->second.end()); + } else { + // Query for P2P devices and update the cache + std::list<ur_device_handle_t> P2PDevices; + ze_bool_t P2P; + for (const auto &D : Context->Devices) { + if (D == Device) + continue; + ZE2UR_CALL(zeDeviceCanAccessPeer, + (D->ZeDevice, Device->ZeDevice, &P2P)); + if (P2P) + P2PDevices.push_back(D); + } + // Update the cache + Context->P2PDeviceCache[Device] = P2PDevices; + Devices.insert(Devices.end(), P2PDevices.begin(), P2PDevices.end()); } } } @@ -295,7 +319,9 @@ static ur_result_t USMHostAllocImpl(void **ResultPtr, return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( +namespace ur::level_zero { + +ur_result_t urUSMHostAlloc( ur_context_handle_t Context, ///< [in] handle of the context object const ur_usm_desc_t *USMDesc, ///< [in][optional] USM memory allocation 
descriptor @@ -310,8 +336,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. // L0 spec says that alignment values that are not powers of 2 are invalid. - if (Align > 65536 || (Align & (Align - 1)) != 0) - return UR_RESULT_ERROR_INVALID_VALUE; + // If alignment == 0, then we are allowing the L0 driver to choose the + // alignment so no need to check. + if (Align > 0) { + if (Align > 65536 || (Align & (Align - 1)) != 0) + return UR_RESULT_ERROR_INVALID_VALUE; + } ur_platform_handle_t Plt = Context->getPlatform(); // If indirect access tracking is enabled then lock the mutex which is @@ -330,7 +360,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( // We are going to defer memory release if there are kernels with indirect // access, that is why explicitly retain context to be sure that it is // released after all memory allocations in this context are released. - UR_CALL(urContextRetain(Context)); + UR_CALL(ur::level_zero::urContextRetain(Context)); } else { ContextLock.lock(); } @@ -363,7 +393,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( +ur_result_t urUSMDeviceAlloc( ur_context_handle_t Context, ///< [in] handle of the context object ur_device_handle_t Device, ///< [in] handle of the device object const ur_usm_desc_t @@ -380,8 +410,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. // L0 spec says that alignment values that are not powers of 2 are invalid. - if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0) - return UR_RESULT_ERROR_INVALID_VALUE; + // If alignment == 0, then we are allowing the L0 driver to choose the + // alignment so no need to check. 
+ if (Alignment > 0) { + if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0) + return UR_RESULT_ERROR_INVALID_VALUE; + } ur_platform_handle_t Plt = Device->Platform; @@ -401,7 +435,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( // We are going to defer memory release if there are kernels with indirect // access, that is why explicitly retain context to be sure that it is // released after all memory allocations in this context are released. - UR_CALL(urContextRetain(Context)); + UR_CALL(ur::level_zero::urContextRetain(Context)); } else { ContextLock.lock(); } @@ -439,7 +473,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( +ur_result_t urUSMSharedAlloc( ur_context_handle_t Context, ///< [in] handle of the context object ur_device_handle_t Device, ///< [in] handle of the device object const ur_usm_desc_t @@ -481,8 +515,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. // L0 spec says that alignment values that are not powers of 2 are invalid. - if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0) - return UR_RESULT_ERROR_INVALID_VALUE; + // If alignment == 0, then we are allowing the L0 driver to choose the + // alignment so no need to check. + if (Alignment > 0) { + if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0) + return UR_RESULT_ERROR_INVALID_VALUE; + } ur_platform_handle_t Plt = Device->Platform; @@ -500,7 +538,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( // We are going to defer memory release if there are kernels with indirect // access, that is why explicitly retain context to be sure that it is // released after all memory allocations in this context are released. 
- UR_CALL(urContextRetain(Context)); + UR_CALL(ur::level_zero::urContextRetain(Context)); } umf_memory_pool_handle_t hPoolInternal = nullptr; @@ -542,9 +580,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urUSMFree( - ur_context_handle_t Context, ///< [in] handle of the context object - void *Mem ///< [in] pointer to USM memory object +ur_result_t +urUSMFree(ur_context_handle_t Context, ///< [in] handle of the context object + void *Mem ///< [in] pointer to USM memory object ) { ur_platform_handle_t Plt = Context->getPlatform(); @@ -554,7 +592,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMFree( return USMFreeHelper(Context, Mem); } -UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo( +ur_result_t urUSMGetMemAllocInfo( ur_context_handle_t Context, ///< [in] handle of the context object const void *Ptr, ///< [in] pointer to USM memory object ur_usm_alloc_info_t @@ -654,6 +692,103 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo( return UR_RESULT_SUCCESS; } +ur_result_t urUSMPoolCreate( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_usm_pool_desc_t + *PoolDesc, ///< [in] pointer to USM pool descriptor. 
Can be chained with + ///< ::ur_usm_pool_limits_desc_t + ur_usm_pool_handle_t *Pool ///< [out] pointer to USM memory pool +) { + + try { + *Pool = reinterpret_cast( + new ur_usm_pool_handle_t_(Context, PoolDesc)); + + std::shared_lock ContextLock(Context->Mutex); + Context->UsmPoolHandles.insert(Context->UsmPoolHandles.cend(), *Pool); + + } catch (const UsmAllocationException &Ex) { + return Ex.getError(); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t +urUSMPoolRetain(ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool +) { + Pool->RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +ur_result_t +urUSMPoolRelease(ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool +) { + if (Pool->RefCount.decrementAndTest()) { + std::shared_lock ContextLock(Pool->Context->Mutex); + Pool->Context->UsmPoolHandles.remove(Pool); + delete Pool; + } + return UR_RESULT_SUCCESS; +} + +ur_result_t urUSMPoolGetInfo( + ur_usm_pool_handle_t Pool, ///< [in] handle of the USM memory pool + ur_usm_pool_info_t PropName, ///< [in] name of the pool property to query + size_t PropSize, ///< [in] size in bytes of the pool property value provided + void *PropValue, ///< [out][typename(propName, propSize)] value of the pool + ///< property + size_t *PropSizeRet ///< [out] size in bytes returned in pool property value +) { + UrReturnHelper ReturnValue(PropSize, PropValue, PropSizeRet); + + switch (PropName) { + case UR_USM_POOL_INFO_REFERENCE_COUNT: { + return ReturnValue(Pool->RefCount.load()); + } + case UR_USM_POOL_INFO_CONTEXT: { + return ReturnValue(Pool->Context); + } + default: { + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + } +} + +ur_result_t urUSMImportExp(ur_context_handle_t Context, void *HostPtr, + size_t Size) { + UR_ASSERT(Context, UR_RESULT_ERROR_INVALID_CONTEXT); + + // Promote the host ptr to USM host memory. 
+ if (ZeUSMImport.Supported && HostPtr != nullptr) { + // Query memory type of the host pointer + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + ZE2UR_CALL(zeMemGetAllocProperties, + (Context->ZeContext, HostPtr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + // If not shared of any type, we can import the ptr + if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) { + // Promote the host ptr to USM host memory + ze_driver_handle_t driverHandle = + Context->getPlatform()->ZeDriverHandleExpTranslated; + ZeUSMImport.doZeUSMImport(driverHandle, HostPtr, Size); + } + } + return UR_RESULT_SUCCESS; +} + +ur_result_t urUSMReleaseExp(ur_context_handle_t Context, void *HostPtr) { + UR_ASSERT(Context, UR_RESULT_ERROR_INVALID_CONTEXT); + + // Release the imported memory. + if (ZeUSMImport.Supported && HostPtr != nullptr) + ZeUSMImport.doZeUSMRelease( + Context->getPlatform()->ZeDriverHandleExpTranslated, HostPtr); + return UR_RESULT_SUCCESS; +} +} // namespace ur::level_zero + static ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) { auto ZeResult = ZE_CALL_NOCHECK(zeMemFree, (Context->ZeContext, Ptr)); // Handle When the driver is already released @@ -766,6 +901,97 @@ umf_result_t L0MemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) { return UMF_RESULT_SUCCESS; } +typedef struct ze_ipc_data_t { + int pid; + ze_ipc_mem_handle_t zeHandle; +} ze_ipc_data_t; + +umf_result_t L0MemoryProvider::get_ipc_handle_size(size_t *Size) { + UR_ASSERT(Size, UMF_RESULT_ERROR_INVALID_ARGUMENT); + *Size = sizeof(ze_ipc_data_t); + + return UMF_RESULT_SUCCESS; +} + +umf_result_t L0MemoryProvider::get_ipc_handle(const void *Ptr, size_t Size, + void *IpcData) { + std::ignore = Size; + + UR_ASSERT(Ptr && IpcData, UMF_RESULT_ERROR_INVALID_ARGUMENT); + ze_ipc_data_t *zeIpcData = (ze_ipc_data_t *)IpcData; + auto Ret = ZE_CALL_NOCHECK(zeMemGetIpcHandle, + (Context->ZeContext, Ptr, &zeIpcData->zeHandle)); + if (Ret != 
ZE_RESULT_SUCCESS) { + return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; + } + + zeIpcData->pid = ur_getpid(); + + return UMF_RESULT_SUCCESS; +} + +umf_result_t L0MemoryProvider::put_ipc_handle(void *IpcData) { + UR_ASSERT(IpcData, UMF_RESULT_ERROR_INVALID_ARGUMENT); + ze_ipc_data_t *zeIpcData = (ze_ipc_data_t *)IpcData; + std::ignore = zeIpcData; + + // zeMemPutIpcHandle was introduced in Level Zero 1.6. Before Level Zero 1.6, + // IPC handle was released automatically when corresponding memory buffer + // was freed. +#if (ZE_API_VERSION_CURRENT >= ZE_MAKE_VERSION(1, 6)) + auto Ret = ZE_CALL_NOCHECK(zeMemPutIpcHandle, + (Context->ZeContext, zeIpcData->zeHandle)); + if (Ret != ZE_RESULT_SUCCESS) { + return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; + } +#endif + + return UMF_RESULT_SUCCESS; +} + +umf_result_t L0MemoryProvider::open_ipc_handle(void *IpcData, void **Ptr) { + UR_ASSERT(IpcData && Ptr, UMF_RESULT_ERROR_INVALID_ARGUMENT); + ze_ipc_data_t *zeIpcData = (ze_ipc_data_t *)IpcData; + + int fdLocal = -1; + if (zeIpcData->pid != ur_getpid()) { + int fdRemote = -1; + memcpy(&fdRemote, &zeIpcData->zeHandle, sizeof(fdRemote)); + fdLocal = ur_duplicate_fd(zeIpcData->pid, fdRemote); + if (fdLocal == -1) { + logger::error("duplicating file descriptor from IPC handle failed"); + return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; + } + + memcpy(&zeIpcData->zeHandle, &fdLocal, sizeof(fdLocal)); + } + + auto Ret = + ZE_CALL_NOCHECK(zeMemOpenIpcHandle, (Context->ZeContext, Device->ZeDevice, + zeIpcData->zeHandle, 0, Ptr)); + if (fdLocal != -1) { + ur_close_fd(fdLocal); + } + + if (Ret != ZE_RESULT_SUCCESS) { + return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; + } + + return UMF_RESULT_SUCCESS; +} + +umf_result_t L0MemoryProvider::close_ipc_handle(void *Ptr, size_t Size) { + std::ignore = Size; + + UR_ASSERT(Ptr, UMF_RESULT_ERROR_INVALID_ARGUMENT); + auto Ret = ZE_CALL_NOCHECK(zeMemCloseIpcHandle, (Context->ZeContext, Ptr)); + if (Ret != ZE_RESULT_SUCCESS) { + return 
UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; + } + + return UMF_RESULT_SUCCESS; +} + ur_result_t L0SharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0, @@ -827,7 +1053,7 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, HostMemPool = umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + umfDisjointPoolOps(), std::move(MemProvider), &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host]) .second; @@ -838,7 +1064,7 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, DeviceMemPools.emplace( std::piecewise_construct, std::make_tuple(device), std::make_tuple(umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + umfDisjointPoolOps(), std::move(MemProvider), &this->DisjointPoolConfigs .Configs[usm::DisjointPoolMemType::Device]) .second)); @@ -849,7 +1075,7 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, SharedMemPools.emplace( std::piecewise_construct, std::make_tuple(device), std::make_tuple(umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + umfDisjointPoolOps(), std::move(MemProvider), &this->DisjointPoolConfigs .Configs[usm::DisjointPoolMemType::Shared]) .second)); @@ -861,75 +1087,13 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, std::piecewise_construct, std::make_tuple(device), std::make_tuple( umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + umfDisjointPoolOps(), std::move(MemProvider), &this->DisjointPoolConfigs .Configs[usm::DisjointPoolMemType::SharedReadOnly]) .second)); } } -UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( - ur_context_handle_t Context, ///< [in] handle of the context object - ur_usm_pool_desc_t - *PoolDesc, ///< [in] pointer to USM pool descriptor. 
Can be chained with - ///< ::ur_usm_pool_limits_desc_t - ur_usm_pool_handle_t *Pool ///< [out] pointer to USM memory pool -) { - - try { - *Pool = reinterpret_cast( - new ur_usm_pool_handle_t_(Context, PoolDesc)); - - std::shared_lock ContextLock(Context->Mutex); - Context->UsmPoolHandles.insert(Context->UsmPoolHandles.cend(), *Pool); - - } catch (const UsmAllocationException &Ex) { - return Ex.getError(); - } - return UR_RESULT_SUCCESS; -} - -ur_result_t -urUSMPoolRetain(ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool -) { - Pool->RefCount.increment(); - return UR_RESULT_SUCCESS; -} - -ur_result_t -urUSMPoolRelease(ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool -) { - if (Pool->RefCount.decrementAndTest()) { - std::shared_lock ContextLock(Pool->Context->Mutex); - Pool->Context->UsmPoolHandles.remove(Pool); - delete Pool; - } - return UR_RESULT_SUCCESS; -} - -ur_result_t urUSMPoolGetInfo( - ur_usm_pool_handle_t Pool, ///< [in] handle of the USM memory pool - ur_usm_pool_info_t PropName, ///< [in] name of the pool property to query - size_t PropSize, ///< [in] size in bytes of the pool property value provided - void *PropValue, ///< [out][typename(propName, propSize)] value of the pool - ///< property - size_t *PropSizeRet ///< [out] size in bytes returned in pool property value -) { - UrReturnHelper ReturnValue(PropSize, PropValue, PropSizeRet); - - switch (PropName) { - case UR_USM_POOL_INFO_REFERENCE_COUNT: { - return ReturnValue(Pool->RefCount.load()); - } - case UR_USM_POOL_INFO_CONTEXT: { - return ReturnValue(Pool->Context); - } - default: { - return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; - } - } -} - // If indirect access tracking is not enabled then this functions just performs // zeMemFree. If indirect access tracking is enabled then reference counting is // performed. 
@@ -1012,38 +1176,3 @@ ur_result_t USMFreeHelper(ur_context_handle_t Context, void *Ptr, UR_CALL(ContextReleaseHelper(Context)); return umf2urResult(umfRet); } - -UR_APIEXPORT ur_result_t UR_APICALL urUSMImportExp(ur_context_handle_t Context, - void *HostPtr, size_t Size) { - UR_ASSERT(Context, UR_RESULT_ERROR_INVALID_CONTEXT); - - // Promote the host ptr to USM host memory. - if (ZeUSMImport.Supported && HostPtr != nullptr) { - // Query memory type of the host pointer - ze_device_handle_t ZeDeviceHandle; - ZeStruct ZeMemoryAllocationProperties; - ZE2UR_CALL(zeMemGetAllocProperties, - (Context->ZeContext, HostPtr, &ZeMemoryAllocationProperties, - &ZeDeviceHandle)); - - // If not shared of any type, we can import the ptr - if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) { - // Promote the host ptr to USM host memory - ze_driver_handle_t driverHandle = - Context->getPlatform()->ZeDriverHandleExpTranslated; - ZeUSMImport.doZeUSMImport(driverHandle, HostPtr, Size); - } - } - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMReleaseExp(ur_context_handle_t Context, - void *HostPtr) { - UR_ASSERT(Context, UR_RESULT_ERROR_INVALID_CONTEXT); - - // Release the imported memory. 
- if (ZeUSMImport.Supported && HostPtr != nullptr) - ZeUSMImport.doZeUSMRelease( - Context->getPlatform()->ZeDriverHandleExpTranslated, HostPtr); - return UR_RESULT_SUCCESS; -} diff --git a/source/adapters/level_zero/usm.hpp b/source/adapters/level_zero/usm.hpp index 6d61e43ee8..2fe74a5ecf 100644 --- a/source/adapters/level_zero/usm.hpp +++ b/source/adapters/level_zero/usm.hpp @@ -85,6 +85,27 @@ class USMMemoryProviderBase { virtual umf_result_t purge_force(void *, size_t) { return UMF_RESULT_ERROR_NOT_SUPPORTED; }; + virtual umf_result_t allocation_merge(void *, void *, size_t) { + return UMF_RESULT_ERROR_NOT_SUPPORTED; + } + virtual umf_result_t allocation_split(void *, size_t, size_t) { + return UMF_RESULT_ERROR_NOT_SUPPORTED; + } + virtual umf_result_t get_ipc_handle_size(size_t *) { + return UMF_RESULT_ERROR_NOT_SUPPORTED; + } + virtual umf_result_t get_ipc_handle(const void *, size_t, void *) { + return UMF_RESULT_ERROR_NOT_SUPPORTED; + } + virtual umf_result_t put_ipc_handle(void *) { + return UMF_RESULT_ERROR_NOT_SUPPORTED; + } + virtual umf_result_t open_ipc_handle(void *, void **) { + return UMF_RESULT_ERROR_NOT_SUPPORTED; + } + virtual umf_result_t close_ipc_handle(void *, size_t) { + return UMF_RESULT_ERROR_NOT_SUPPORTED; + } virtual const char *get_name() { return ""; }; virtual ~USMMemoryProviderBase() = default; }; @@ -105,6 +126,11 @@ class L0MemoryProvider : public USMMemoryProviderBase { umf_result_t get_min_page_size(void *, size_t *) override; // TODO: Different name for each provider (Host/Shared/SharedRO/Device) const char *get_name() override { return "L0"; }; + umf_result_t get_ipc_handle_size(size_t *) override; + umf_result_t get_ipc_handle(const void *, size_t, void *) override; + umf_result_t put_ipc_handle(void *) override; + umf_result_t open_ipc_handle(void *, void **) override; + umf_result_t close_ipc_handle(void *, size_t) override; }; // Allocation routines for shared memory type diff --git a/source/adapters/level_zero/usm_p2p.cpp 
b/source/adapters/level_zero/usm_p2p.cpp index 2b81828423..6e701aa803 100644 --- a/source/adapters/level_zero/usm_p2p.cpp +++ b/source/adapters/level_zero/usm_p2p.cpp @@ -11,8 +11,10 @@ #include "logger/ur_logger.hpp" #include "ur_level_zero.hpp" -UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp( - ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { +namespace ur::level_zero { + +ur_result_t urUsmP2PEnablePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice) { std::ignore = commandDevice; std::ignore = peerDevice; @@ -21,8 +23,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PDisablePeerAccessExp( - ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { +ur_result_t urUsmP2PDisablePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice) { std::ignore = commandDevice; std::ignore = peerDevice; @@ -31,10 +33,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PDisablePeerAccessExp( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( - ur_device_handle_t commandDevice, ur_device_handle_t peerDevice, - ur_exp_peer_info_t propName, size_t propSize, void *pPropValue, - size_t *pPropSizeRet) { +ur_result_t urUsmP2PPeerAccessGetInfoExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice, + ur_exp_peer_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); @@ -69,3 +72,4 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return ReturnValue(propertyValue); } +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/v2/README.md b/source/adapters/level_zero/v2/README.md index b75788ce38..27331e4777 100644 --- a/source/adapters/level_zero/v2/README.md +++ b/source/adapters/level_zero/v2/README.md @@ -2,13 +2,14 @@ This is 
the home directory for L0 v2 adapter sources. This is a redesigned version of the L0 adapter that focuses on maximizing the performance of each queue mode individually (immediate/batched, in-order/out-of-order). -L0 v2 adapter can be enabled by setting `UR_L0_USE_QUEUE_V2=1` env variable. If the variable is not set, legacy path will be used. +L0 v2 adapter can be enabled by passing the `UR_BUILD_ADAPTER_L0_V2=1` option to cmake. When enabled, `libur_adapter_level_zero_v2.[so|dll]` will be created. # Code structure -v2 adapter only rewrites certain functions (mostly urEnqueue* functions) while reusing the rest. `ur_queue_handle_t` has become an abstract class and each enqueue function a virtual function. +The v2 adapter is a standalone adapter but reuses some logic from the legacy L0 adapter implementation - most notably: adapter.cpp, platform.cpp, device.cpp -Legacy enqeue path is implemented in `ur_queue_handle_legacy_t` which inherits from `ur_queue_handle_t`. For new, optimized path, each queue mode will be implemented as a separate queue class (e.g. `v2::ur_queue_immediate_in_order_t`) inheriting from `ur_queue_handle_t`. +Each queue mode will be implemented as a separate queue class (e.g. `v2::ur_queue_immediate_in_order_t`) inheriting from `ur_queue_handle_t` which is an abstract class in the v2 adapter. `ur_queue_handle_t` is auto-generated by `make generate-code` - for every API function that accepts `ur_queue_handle_t` as a first parameter, new pure virtual method is created. The API function is then -auto-implemented (see ../queue_api.cpp) by dispatching to that virtual method. Developer is only responsbile for implementing that virtual function for every queue base class. +auto-implemented (see ./queue_api.cpp) by dispatching to that virtual method. Developer is only responsible for implementing that virtual function for every queue base class. 
diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp new file mode 100644 index 0000000000..8515997f35 --- /dev/null +++ b/source/adapters/level_zero/v2/api.cpp @@ -0,0 +1,652 @@ +//===--------- api.cpp - Level Zero Adapter ------------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include "../common.hpp" +#include "logger/ur_logger.hpp" + +std::mutex ZeCall::GlobalLock; + +namespace ur::level_zero { +ur_result_t urContextGetNativeHandle(ur_context_handle_t hContext, + ur_native_handle_t *phNativeContext) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urContextCreateWithNativeHandle( + ur_native_handle_t hNativeContext, ur_adapter_handle_t hAdapter, + uint32_t numDevices, const ur_device_handle_t *phDevices, + const ur_context_native_properties_t *pProperties, + ur_context_handle_t *phContext) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t +urContextSetExtendedDeleter(ur_context_handle_t hContext, + ur_context_extended_deleter_t pfnDeleter, + void *pUserData) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urMemImageCreate(ur_context_handle_t hContext, ur_mem_flags_t flags, + const ur_image_format_t *pImageFormat, + const ur_image_desc_t *pImageDesc, void *pHost, + ur_mem_handle_t *phMem) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urMemGetNativeHandle(ur_mem_handle_t hMem, + 
ur_device_handle_t hDevice, + ur_native_handle_t *phNativeMem) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urMemImageCreateWithNativeHandle( + ur_native_handle_t hNativeMem, ur_context_handle_t hContext, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + const ur_mem_native_properties_t *pProperties, ur_mem_handle_t *phMem) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urMemImageGetInfo(ur_mem_handle_t hMemory, ur_image_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urSamplerCreate(ur_context_handle_t hContext, + const ur_sampler_desc_t *pDesc, + ur_sampler_handle_t *phSampler) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urSamplerRetain(ur_sampler_handle_t hSampler) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urSamplerRelease(ur_sampler_handle_t hSampler) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler, + ur_sampler_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urSamplerGetNativeHandle(ur_sampler_handle_t hSampler, + ur_native_handle_t *phNativeSampler) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urSamplerCreateWithNativeHandle( + ur_native_handle_t hNativeSampler, ur_context_handle_t 
hContext, + const ur_sampler_native_properties_t *pProperties, + ur_sampler_handle_t *phSampler) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urVirtualMemGranularityGetInfo( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_virtual_mem_granularity_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urVirtualMemReserve(ur_context_handle_t hContext, + const void *pStart, size_t size, + void **ppStart) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urVirtualMemFree(ur_context_handle_t hContext, const void *pStart, + size_t size) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urVirtualMemMap(ur_context_handle_t hContext, const void *pStart, + size_t size, ur_physical_mem_handle_t hPhysicalMem, + size_t offset, + ur_virtual_mem_access_flags_t flags) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urVirtualMemUnmap(ur_context_handle_t hContext, const void *pStart, + size_t size) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urVirtualMemSetAccess(ur_context_handle_t hContext, + const void *pStart, size_t size, + ur_virtual_mem_access_flags_t flags) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urVirtualMemGetInfo(ur_context_handle_t hContext, + const void *pStart, size_t size, + ur_virtual_mem_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + logger::error("{} function not implemented!", 
__FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urPhysicalMemCreate(ur_context_handle_t hContext, + ur_device_handle_t hDevice, size_t size, + const ur_physical_mem_properties_t *pProperties, + ur_physical_mem_handle_t *phPhysicalMem) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urPhysicalMemRetain(ur_physical_mem_handle_t hPhysicalMem) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urKernelGetInfo(ur_kernel_handle_t hKernel, + ur_kernel_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t +urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_sampler_properties_t *pProperties, + ur_sampler_handle_t hArgValue) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urKernelSetSpecializationConstants( + ur_kernel_handle_t hKernel, uint32_t count, + const ur_specialization_constant_info_t *pSpecConstants) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urKernelGetNativeHandle(ur_kernel_handle_t hKernel, + ur_native_handle_t *phNativeKernel) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t +urKernelCreateWithNativeHandle(ur_native_handle_t hNativeKernel, + ur_context_handle_t hContext, + ur_program_handle_t hProgram, + const ur_kernel_native_properties_t *pProperties, + 
ur_kernel_handle_t *phKernel) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urKernelGetSuggestedLocalWorkSize(ur_kernel_handle_t hKernel, + ur_queue_handle_t hQueue, + uint32_t numWorkDim, + const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, + size_t *pSuggestedLocalWorkSize) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urEventGetNativeHandle(ur_event_handle_t hEvent, + ur_native_handle_t *phNativeEvent) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t +urEventCreateWithNativeHandle(ur_native_handle_t hNativeEvent, + ur_context_handle_t hContext, + const ur_event_native_properties_t *pProperties, + ur_event_handle_t *phEvent) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urEventSetCallback(ur_event_handle_t hEvent, + ur_execution_info_t execStatus, + ur_event_callback_t pfnNotify, void *pUserData) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urUSMPitchedAllocExp(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t pool, size_t widthInBytes, + size_t height, size_t elementSizeBytes, + void **ppMem, size_t *pResultPitch) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urBindlessImagesUnsampledImageHandleDestroyExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_image_native_handle_t hImage) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urBindlessImagesSampledImageHandleDestroyExp( + ur_context_handle_t 
hContext, ur_device_handle_t hDevice, + ur_exp_image_native_handle_t hImage) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urBindlessImagesImageAllocateExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + ur_exp_image_mem_native_handle_t *phImageMem) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t +urBindlessImagesImageFreeExp(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + ur_exp_image_mem_native_handle_t hImageMem) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urBindlessImagesUnsampledImageCreateExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_image_mem_native_handle_t hImageMem, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + ur_exp_image_native_handle_t *phImage) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urBindlessImagesSampledImageCreateExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_image_mem_native_handle_t hImageMem, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + ur_sampler_handle_t hSampler, ur_exp_image_native_handle_t *phImage) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urBindlessImagesImageGetInfoExp( + ur_context_handle_t hContext, ur_exp_image_mem_native_handle_t hImageMem, + ur_image_info_t propName, void *pPropValue, size_t *pPropSizeRet) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urBindlessImagesMipmapGetLevelExp( + ur_context_handle_t 
hContext, ur_device_handle_t hDevice, + ur_exp_image_mem_native_handle_t hImageMem, uint32_t mipmapLevel, + ur_exp_image_mem_native_handle_t *phImageMem) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t +urBindlessImagesMipmapFreeExp(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + ur_exp_image_mem_native_handle_t hMem) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urBindlessImagesImportExternalMemoryExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, + ur_exp_external_mem_type_t memHandleType, + ur_exp_external_mem_desc_t *pExternalMemDesc, + ur_exp_external_mem_handle_t *phExternalMem) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urBindlessImagesMapExternalArrayExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + ur_exp_external_mem_handle_t hExternalMem, + ur_exp_image_mem_native_handle_t *phImageMem) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint64_t offset, + uint64_t size, ur_exp_external_mem_handle_t hExternalMem, void **ppRetMem) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urBindlessImagesReleaseExternalMemoryExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_external_mem_handle_t hExternalMem) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urBindlessImagesImportExternalSemaphoreExp( + ur_context_handle_t hContext, 
ur_device_handle_t hDevice, + ur_exp_external_semaphore_type_t semHandleType, + ur_exp_external_semaphore_desc_t *pExternalSemaphoreDesc, + ur_exp_external_semaphore_handle_t *phExternalSemaphore) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urBindlessImagesReleaseExternalSemaphoreExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_external_semaphore_handle_t hExternalSemaphore) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t +urCommandBufferCreateExp(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + const ur_exp_command_buffer_desc_t *pCommandBufferDesc, + ur_exp_command_buffer_handle_t *phCommandBuffer) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t +urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t +urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t +urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferAppendKernelLaunchExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, + uint32_t workDim, const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numKernelAlternatives, ur_kernel_handle_t *phKernelAlternatives, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t NumEventsInWaitList, const 
ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferAppendUSMMemcpyExp( + ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc, + size_t size, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferAppendUSMFillExp( + ur_exp_command_buffer_handle_t hCommandBuffer, void *pMemory, + const void *pPattern, size_t patternSize, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferAppendMemBufferCopyExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, + ur_mem_handle_t hDstMem, size_t srcOffset, size_t dstOffset, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + logger::error("{} function not implemented!", 
__FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferAppendMemBufferWriteExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + size_t offset, size_t size, const void *pSrc, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferAppendMemBufferReadExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + size_t offset, size_t size, void *pDst, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferAppendMemBufferCopyRectExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, + ur_mem_handle_t hDstMem, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, + size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + 
+ur_result_t urCommandBufferAppendMemBufferWriteRectExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferAppendMemBufferReadRectExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pDst, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferAppendMemBufferFillExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + const void *pPattern, size_t patternSize, size_t offset, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + 
logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferAppendUSMPrefetchExp( + ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, + size_t size, ur_usm_migration_flags_t flags, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferAppendUSMAdviseExp( + ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, + size_t size, ur_usm_advice_flags_t advice, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferEnqueueExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferRetainCommandExp( + ur_exp_command_buffer_command_handle_t hCommand) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferReleaseCommandExp( + ur_exp_command_buffer_command_handle_t hCommand) { + logger::error("{} function not implemented!", __FUNCTION__); + 
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferUpdateKernelLaunchExp( + ur_exp_command_buffer_command_handle_t hCommand, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *pUpdateKernelLaunch) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t +urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, + ur_exp_command_buffer_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferUpdateSignalEventExp( + ur_exp_command_buffer_command_handle_t hCommand, + ur_event_handle_t *phEvent) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferUpdateWaitEventsExp( + ur_exp_command_buffer_command_handle_t hCommand, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urCommandBufferCommandGetInfoExp( + ur_exp_command_buffer_command_handle_t hCommand, + ur_exp_command_buffer_command_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( + ur_kernel_handle_t hKernel, size_t localWorkSize, + size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urUSMImportExp(ur_context_handle_t hContext, void *pMem, + size_t size) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + 
+ur_result_t urUSMReleaseExp(ur_context_handle_t hContext, void *pMem) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urUsmP2PEnablePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urUsmP2PDisablePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urUsmP2PPeerAccessGetInfoExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice, + ur_exp_peer_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/v2/command_list_cache.cpp b/source/adapters/level_zero/v2/command_list_cache.cpp index c240cc8ee7..651cb5944a 100644 --- a/source/adapters/level_zero/v2/command_list_cache.cpp +++ b/source/adapters/level_zero/v2/command_list_cache.cpp @@ -9,8 +9,8 @@ //===----------------------------------------------------------------------===// #include "command_list_cache.hpp" +#include "context.hpp" -#include "../context.hpp" #include "../device.hpp" bool v2::immediate_command_list_descriptor_t::operator==( @@ -43,7 +43,7 @@ inline size_t command_list_descriptor_hash_t::operator()( command_list_cache_t::command_list_cache_t(ze_context_handle_t ZeContext) : ZeContext{ZeContext} {} -raii::ze_command_list_t +raii::ze_command_list_handle_t command_list_cache_t::createCommandList(const command_list_descriptor_t &desc) { if (auto ImmCmdDesc = std::get_if(&desc)) { @@ -61,7 +61,7 @@ command_list_cache_t::createCommandList(const command_list_descriptor_t &desc) { ZE2UR_CALL_THROWS( 
zeCommandListCreateImmediate, (ZeContext, ImmCmdDesc->ZeDevice, &QueueDesc, &ZeCommandList)); - return raii::ze_command_list_t(ZeCommandList, &zeCommandListDestroy); + return raii::ze_command_list_handle_t(ZeCommandList); } else { auto RegCmdDesc = std::get(desc); ZeStruct CmdListDesc; @@ -72,7 +72,7 @@ command_list_cache_t::createCommandList(const command_list_descriptor_t &desc) { ze_command_list_handle_t ZeCommandList; ZE2UR_CALL_THROWS(zeCommandListCreate, (ZeContext, RegCmdDesc.ZeDevice, &CmdListDesc, &ZeCommandList)); - return raii::ze_command_list_t(ZeCommandList, &zeCommandListDestroy); + return raii::ze_command_list_handle_t(ZeCommandList); } } @@ -81,6 +81,8 @@ command_list_cache_t::getImmediateCommandList( ze_device_handle_t ZeDevice, bool IsInOrder, uint32_t Ordinal, ze_command_queue_mode_t Mode, ze_command_queue_priority_t Priority, std::optional Index) { + TRACK_SCOPE_LATENCY("command_list_cache_t::getImmediateCommandList"); + immediate_command_list_descriptor_t Desc; Desc.ZeDevice = ZeDevice; Desc.Ordinal = Ordinal; @@ -92,28 +94,29 @@ command_list_cache_t::getImmediateCommandList( auto CommandList = getCommandList(Desc).release(); return raii::cache_borrowed_command_list_t( CommandList, [Cache = this, Desc](ze_command_list_handle_t CmdList) { - Cache->addCommandList( - Desc, raii::ze_command_list_t(CmdList, &zeCommandListDestroy)); + Cache->addCommandList(Desc, raii::ze_command_list_handle_t(CmdList)); }); } raii::cache_borrowed_command_list_t command_list_cache_t::getRegularCommandList(ze_device_handle_t ZeDevice, bool IsInOrder, uint32_t Ordinal) { + TRACK_SCOPE_LATENCY("command_list_cache_t::getRegularCommandList"); + regular_command_list_descriptor_t Desc; Desc.ZeDevice = ZeDevice; Desc.IsInOrder = IsInOrder; Desc.Ordinal = Ordinal; auto CommandList = getCommandList(Desc).release(); + return raii::cache_borrowed_command_list_t( CommandList, [Cache = this, Desc](ze_command_list_handle_t CmdList) { - Cache->addCommandList( - Desc, 
raii::ze_command_list_t(CmdList, &zeCommandListDestroy)); + Cache->addCommandList(Desc, raii::ze_command_list_handle_t(CmdList)); }); } -raii::ze_command_list_t +raii::ze_command_list_handle_t command_list_cache_t::getCommandList(const command_list_descriptor_t &desc) { std::unique_lock Lock(ZeCommandListCacheMutex); auto it = ZeCommandListCache.find(desc); @@ -124,7 +127,8 @@ command_list_cache_t::getCommandList(const command_list_descriptor_t &desc) { assert(!it->second.empty()); - raii::ze_command_list_t CommandListHandle = std::move(it->second.top()); + raii::ze_command_list_handle_t CommandListHandle = + std::move(it->second.top()); it->second.pop(); if (it->second.empty()) @@ -133,8 +137,9 @@ command_list_cache_t::getCommandList(const command_list_descriptor_t &desc) { return CommandListHandle; } -void command_list_cache_t::addCommandList(const command_list_descriptor_t &desc, - raii::ze_command_list_t cmdList) { +void command_list_cache_t::addCommandList( + const command_list_descriptor_t &desc, + raii::ze_command_list_handle_t cmdList) { // TODO: add a limit? 
std::unique_lock Lock(ZeCommandListCacheMutex); auto [it, _] = ZeCommandListCache.try_emplace(desc); diff --git a/source/adapters/level_zero/v2/command_list_cache.hpp b/source/adapters/level_zero/v2/command_list_cache.hpp index 7cacf40604..9884e16dc4 100644 --- a/source/adapters/level_zero/v2/command_list_cache.hpp +++ b/source/adapters/level_zero/v2/command_list_cache.hpp @@ -12,19 +12,18 @@ #include #include +#include "latency_tracker.hpp" #include -#include +#include #include -#include "../common.hpp" +#include "common.hpp" namespace v2 { namespace raii { -using ze_command_list_t = std::unique_ptr<::_ze_command_list_handle_t, - decltype(&zeCommandListDestroy)>; using cache_borrowed_command_list_t = std::unique_ptr<::_ze_command_list_handle_t, - std::function>; + std::function>; } // namespace raii struct immediate_command_list_descriptor_t { @@ -71,15 +70,16 @@ struct command_list_cache_t { private: ze_context_handle_t ZeContext; std::unordered_map, + std::stack, command_list_descriptor_hash_t> ZeCommandListCache; ur_mutex ZeCommandListCacheMutex; - raii::ze_command_list_t getCommandList(const command_list_descriptor_t &desc); + raii::ze_command_list_handle_t + getCommandList(const command_list_descriptor_t &desc); void addCommandList(const command_list_descriptor_t &desc, - raii::ze_command_list_t cmdList); - raii::ze_command_list_t + raii::ze_command_list_handle_t cmdList); + raii::ze_command_list_handle_t createCommandList(const command_list_descriptor_t &desc); }; } // namespace v2 diff --git a/source/adapters/level_zero/v2/common.hpp b/source/adapters/level_zero/v2/common.hpp new file mode 100644 index 0000000000..4fb851bad8 --- /dev/null +++ b/source/adapters/level_zero/v2/common.hpp @@ -0,0 +1,106 @@ +//===--------- common.hpp - Level Zero Adapter ---------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include "../common.hpp" +#include "logger/ur_logger.hpp" + +namespace v2 { + +namespace raii { + +template +struct ze_handle_wrapper { + ze_handle_wrapper(bool ownZeHandle = true) + : handle(nullptr), ownZeHandle(ownZeHandle) {} + + ze_handle_wrapper(ZeHandleT handle, bool ownZeHandle = true) + : handle(handle), ownZeHandle(ownZeHandle) {} + + ze_handle_wrapper(const ze_handle_wrapper &) = delete; + ze_handle_wrapper &operator=(const ze_handle_wrapper &) = delete; + + ze_handle_wrapper(ze_handle_wrapper &&other) + : handle(other.handle), ownZeHandle(other.ownZeHandle) { + other.handle = nullptr; + } + + ze_handle_wrapper &operator=(ze_handle_wrapper &&other) { + if (this == &other) { + return *this; + } + + if (handle) { + reset(); + } + handle = other.handle; + ownZeHandle = other.ownZeHandle; + other.handle = nullptr; + return *this; + } + + ~ze_handle_wrapper() { + try { + reset(); + } catch (...) { + // TODO: add appropriate logging or pass the error + // to the caller (make the dtor noexcept(false) or use tls?) + } + } + + void reset() { + if (!handle) { + return; + } + + auto zeResult = ZE_CALL_NOCHECK(destroy, (handle)); + // Gracefully handle the case that L0 was already unloaded. 
+ if (zeResult && zeResult != ZE_RESULT_ERROR_UNINITIALIZED) + throw ze2urResult(zeResult); + + handle = nullptr; + } + + ZeHandleT release() { + auto handle = this->handle; + this->handle = nullptr; + return handle; + } + + ZeHandleT get() const { return handle; } + + ZeHandleT *ptr() { return &handle; } + +private: + ZeHandleT handle; + bool ownZeHandle; +}; + +using ze_kernel_handle_t = + ze_handle_wrapper<::ze_kernel_handle_t, zeKernelDestroy>; + +using ze_event_handle_t = + ze_handle_wrapper<::ze_event_handle_t, zeEventDestroy>; + +using ze_event_pool_handle_t = + ze_handle_wrapper<::ze_event_pool_handle_t, zeEventPoolDestroy>; + +using ze_context_handle_t = + ze_handle_wrapper<::ze_context_handle_t, zeContextDestroy>; + +using ze_command_list_handle_t = + ze_handle_wrapper<::ze_command_list_handle_t, zeCommandListDestroy>; + +} // namespace raii +} // namespace v2 diff --git a/source/adapters/level_zero/v2/context.cpp b/source/adapters/level_zero/v2/context.cpp index cedc4fcd5d..de9805817b 100644 --- a/source/adapters/level_zero/v2/context.cpp +++ b/source/adapters/level_zero/v2/context.cpp @@ -8,15 +8,151 @@ // //===----------------------------------------------------------------------===// +#include "../device.hpp" + #include "context.hpp" +#include "event_provider_normal.hpp" + +static std::vector +filterP2PDevices(ur_device_handle_t hSourceDevice, + const std::vector &devices) { + std::vector p2pDevices; + for (auto &device : devices) { + if (device == hSourceDevice) { + continue; + } + + ze_bool_t p2p; + ZE2UR_CALL_THROWS(zeDeviceCanAccessPeer, + (hSourceDevice->ZeDevice, device->ZeDevice, &p2p)); -namespace v2 { + if (p2p) { + p2pDevices.push_back(device); + } + } + return p2pDevices; +} + +static std::vector> +populateP2PDevices(size_t maxDevices, + const std::vector &devices) { + std::vector> p2pDevices(maxDevices); + for (auto &device : devices) { + p2pDevices[device->Id.value()] = filterP2PDevices(device, devices); + } + return p2pDevices; +} 
ur_context_handle_t_::ur_context_handle_t_(ze_context_handle_t hContext, uint32_t numDevices, const ur_device_handle_t *phDevices, bool ownZeContext) - : ::ur_context_handle_t_(hContext, numDevices, phDevices, ownZeContext), - commandListCache(hContext) {} + : commandListCache(hContext), + eventPoolCache(phDevices[0]->Platform->getNumDevices(), + [context = this, platform = phDevices[0]->Platform]( + DeviceId deviceId, v2::event_flags_t flags) { + auto device = platform->getDeviceById(deviceId); + // TODO: just use per-context id? + return std::make_unique( + context, device, v2::QUEUE_IMMEDIATE, flags); + }), + hContext(hContext, ownZeContext), + hDevices(phDevices, phDevices + numDevices), + p2pAccessDevices(populateP2PDevices( + phDevices[0]->Platform->getNumDevices(), this->hDevices)), + defaultUSMPool(this, nullptr) {} + +ur_result_t ur_context_handle_t_::retain() { + RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_context_handle_t_::release() { + if (!RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + delete this; + return UR_RESULT_SUCCESS; +} + +ur_platform_handle_t ur_context_handle_t_::getPlatform() const { + return hDevices[0]->Platform; +} + +const std::vector & +ur_context_handle_t_::getDevices() const { + return hDevices; +} + +bool ur_context_handle_t_::isValidDevice(ur_device_handle_t hDevice) const { + while (hDevice) { + if (std::find(hDevices.begin(), hDevices.end(), hDevice) != hDevices.end()) + return true; + hDevice = hDevice->RootDevice; + } + return false; +} + +ur_usm_pool_handle_t ur_context_handle_t_::getDefaultUSMPool() { + return &defaultUSMPool; +} + +const std::vector & +ur_context_handle_t_::getP2PDevices(ur_device_handle_t hDevice) const { + return p2pAccessDevices[hDevice->Id.value()]; +} + +namespace ur::level_zero { +ur_result_t urContextCreate(uint32_t deviceCount, + const ur_device_handle_t *phDevices, + const ur_context_properties_t *pProperties, + ur_context_handle_t *phContext) { + 
std::ignore = pProperties; + + ur_platform_handle_t hPlatform = phDevices[0]->Platform; + ZeStruct contextDesc{}; + + ze_context_handle_t zeContext{}; + ZE2UR_CALL(zeContextCreate, (hPlatform->ZeDriver, &contextDesc, &zeContext)); + + *phContext = + new ur_context_handle_t_(zeContext, deviceCount, phDevices, true); + return UR_RESULT_SUCCESS; +} + +ur_result_t urContextRetain(ur_context_handle_t hContext) { + return hContext->retain(); +} + +ur_result_t urContextRelease(ur_context_handle_t hContext) { + return hContext->release(); +} + +ur_result_t urContextGetInfo(ur_context_handle_t hContext, + ur_context_info_t contextInfoType, size_t propSize, + + void *pContextInfo, -} // namespace v2 + size_t *pPropSizeRet) { + std::shared_lock Lock(hContext->Mutex); + UrReturnHelper ReturnValue(propSize, pContextInfo, pPropSizeRet); + switch ( + (uint32_t)contextInfoType) { // cast to avoid warnings on EXT enum values + case UR_CONTEXT_INFO_DEVICES: + return ReturnValue(hContext->getDevices().data(), + hContext->getDevices().size()); + case UR_CONTEXT_INFO_NUM_DEVICES: + return ReturnValue(uint32_t(hContext->getDevices().size())); + case UR_CONTEXT_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{hContext->RefCount.load()}); + case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: + // TODO: this is currently not implemented + return ReturnValue(uint8_t{false}); + case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: + // 2D USM fill is not supported. 
+ return ReturnValue(uint8_t{false}); + default: + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } +} +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/v2/context.hpp b/source/adapters/level_zero/v2/context.hpp index b518036542..b3ba6050dd 100644 --- a/source/adapters/level_zero/v2/context.hpp +++ b/source/adapters/level_zero/v2/context.hpp @@ -10,20 +10,40 @@ #pragma once +#include + #include "command_list_cache.hpp" +#include "common.hpp" +#include "event_pool_cache.hpp" +#include "usm.hpp" -#include "../context.hpp" +struct ur_context_handle_t_ : _ur_object { + ur_context_handle_t_(ze_context_handle_t hContext, uint32_t numDevices, + const ur_device_handle_t *phDevices, bool ownZeContext); -namespace v2 { + ur_result_t retain(); + ur_result_t release(); -struct ur_context_handle_t_; -using ur_context_handle_t = ur_context_handle_t_ *; + inline ze_context_handle_t getZeHandle() const { return hContext.get(); } + ur_platform_handle_t getPlatform() const; + const std::vector &getDevices() const; + ur_usm_pool_handle_t getDefaultUSMPool(); + const std::vector & + getP2PDevices(ur_device_handle_t hDevice) const; -struct ur_context_handle_t_ : public ::ur_context_handle_t_ { - ur_context_handle_t_(ze_context_handle_t hContext, uint32_t numDevices, - const ur_device_handle_t *phDevices, bool ownZeContext); + // Checks if Device is covered by this context. + // For that the Device or its root devices need to be in the context. + bool isValidDevice(ur_device_handle_t Device) const; - command_list_cache_t commandListCache; -}; + v2::command_list_cache_t commandListCache; + v2::event_pool_cache eventPoolCache; + +private: + const v2::raii::ze_context_handle_t hContext; + const std::vector hDevices; -} // namespace v2 + // P2P devices for each device in the context, indexed by device id. 
+ const std::vector> p2pAccessDevices; + + ur_usm_pool_handle_t_ defaultUSMPool; +}; diff --git a/source/adapters/level_zero/v2/event.cpp b/source/adapters/level_zero/v2/event.cpp index 1037553841..ba42b1ba0b 100644 --- a/source/adapters/level_zero/v2/event.cpp +++ b/source/adapters/level_zero/v2/event.cpp @@ -7,28 +7,247 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// + +#include + #include "event.hpp" -#include "adapters/level_zero/v2/event_provider.hpp" -#include "ze_api.h" +#include "event_pool.hpp" +#include "event_provider.hpp" -namespace v2 { -void ur_event::attachZeHandle(event_allocation event) { - type = event.type; - zeEvent = std::move(event.borrow); -} +#include "../ur_interface_loader.hpp" + +ur_event_handle_t_::ur_event_handle_t_( + v2::raii::cache_borrowed_event eventAllocation, v2::event_pool *pool) + : zeEvent(std::move(eventAllocation)), pool(pool), + adjustedEventStartTimestamp(0), recordEventEndTimestamp(0), + adjustedEventEndTimestamp(0), + zeTimerResolution(getDevice()->ZeDeviceProperties->timerResolution), + timestampMaxValue(getDevice()->getTimestampMask()) {} -event_borrowed ur_event::detachZeHandle() { +void ur_event_handle_t_::reset() { // consider make an abstraction for regular/counter based // events if there's more of this type of conditions - if (type == event_type::EVENT_REGULAR) { + if (pool->getFlags() & v2::EVENT_FLAGS_COUNTER) { zeEventHostReset(zeEvent.get()); } - auto e = std::move(zeEvent); - zeEvent = nullptr; +} + +ze_event_handle_t ur_event_handle_t_::getZeEvent() const { + return zeEvent.get(); +} + +ur_result_t ur_event_handle_t_::retain() { + RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_event_handle_t_::release() { + if (!RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + if (isTimestamped() && adjustedEventEndTimestamp == 0) { + // L0 will write end timestamp to this event some time in 
the future, + // so we can't release it yet. + // TODO: delay releasing until the end timestamp is written. + return UR_RESULT_SUCCESS; + } + + pool->free(this); + + return UR_RESULT_SUCCESS; +} + +bool ur_event_handle_t_::isTimestamped() const { + // If we are recording, the start time of the event will be non-zero. + return adjustedEventStartTimestamp != 0; +} + +bool ur_event_handle_t_::isProfilingEnabled() const { + return pool->getFlags() & v2::EVENT_FLAGS_PROFILING_ENABLED; +} + +ur_device_handle_t ur_event_handle_t_::getDevice() const { + return pool->getProvider()->device(); +} + +uint64_t ur_event_handle_t_::getEventStartTimestmap() const { + return adjustedEventStartTimestamp; +} + +static uint64_t adjustEndEventTimestamp(uint64_t adjustedStartTimestamp, + uint64_t endTimestamp, + uint64_t timestampMaxValue, + uint64_t timerResolution) { + // End time needs to be adjusted for resolution and valid bits. + uint64_t adjustedTimestamp = + (endTimestamp & timestampMaxValue) * timerResolution; - return e; + // Handle a possible wrap-around (the underlying HW counter is < 64-bit). + // Note, it will not report correct time if there were multiple wrap + // arounds, and the longer term plan is to enlarge the capacity of the + // HW timestamps. + if (adjustedTimestamp < adjustedStartTimestamp) + adjustedTimestamp += timestampMaxValue * timerResolution; + + return adjustedTimestamp; +} + +uint64_t ur_event_handle_t_::getEventEndTimestamp() { + std::scoped_lock lock(this->Mutex); + + // If adjustedEventEndTimestamp on the event is non-zero it means it has + // collected the result of the queue already. In that case it has been + // adjusted and is ready for immediate return. + if (adjustedEventEndTimestamp) + return adjustedEventEndTimestamp; + + // If the result is 0, we have not yet gotten results back and so we just + // return it. 
+ if (recordEventEndTimestamp == 0) + return recordEventEndTimestamp; + + // Now that we have the result, there is no need to keep it in the queue + // anymore, so we cache it on the event and evict the record from the + // queue. + adjustedEventEndTimestamp = + adjustEndEventTimestamp(getEventStartTimestmap(), recordEventEndTimestamp, + timestampMaxValue, zeTimerResolution); + return adjustedEventEndTimestamp; } -ze_event_handle_t ur_event::getZeEvent() { return zeEvent.get(); } +void ur_event_handle_t_::recordStartTimestamp() { + uint64_t deviceStartTimestamp = 0; + UR_CALL_THROWS(ur::level_zero::urDeviceGetGlobalTimestamps( + getDevice(), &deviceStartTimestamp, nullptr)); -} // namespace v2 + std::scoped_lock lock(this->Mutex); + + adjustedEventStartTimestamp = deviceStartTimestamp; +} + +uint64_t *ur_event_handle_t_::getEventEndTimestampPtr() { + return &recordEventEndTimestamp; +} + +namespace ur::level_zero { +ur_result_t urEventRetain(ur_event_handle_t hEvent) { return hEvent->retain(); } + +ur_result_t urEventRelease(ur_event_handle_t hEvent) { + return hEvent->release(); +} + +ur_result_t urEventWait(uint32_t numEvents, + const ur_event_handle_t *phEventWaitList) { + for (uint32_t i = 0; i < numEvents; ++i) { + ZE2UR_CALL(zeEventHostSynchronize, + (phEventWaitList[i]->getZeEvent(), UINT64_MAX)); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t urEventGetInfo(ur_event_handle_t hEvent, ur_event_info_t propName, + size_t propValueSize, void *pPropValue, + size_t *pPropValueSizeRet) { + UrReturnHelper returnValue(propValueSize, pPropValue, pPropValueSizeRet); + + switch (propName) { + case UR_EVENT_INFO_COMMAND_EXECUTION_STATUS: { + auto zeStatus = ZE_CALL_NOCHECK(zeEventQueryStatus, (hEvent->getZeEvent())); + + if (zeStatus == ZE_RESULT_NOT_READY) { + return returnValue(UR_EVENT_STATUS_SUBMITTED); + } else { + return returnValue(UR_EVENT_STATUS_COMPLETE); + } + } + case UR_EVENT_INFO_REFERENCE_COUNT: { + return returnValue(hEvent->RefCount.load()); + } + 
default: + logger::error( + "Unsupported ParamName in urEventGetInfo: ParamName=ParamName={}(0x{})", + propName, logger::toHex(propName)); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t urEventGetProfilingInfo( + ur_event_handle_t hEvent, ///< [in] handle of the event object + ur_profiling_info_t + propName, ///< [in] the name of the profiling property to query + size_t + propValueSize, ///< [in] size in bytes of the profiling property value + void *pPropValue, ///< [out][optional] value of the profiling property + size_t *pPropValueSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes returned in propValue +) { + // The event must either have profiling enabled or be recording timestamps. + bool isTimestampedEvent = hEvent->isTimestamped(); + if (!hEvent->isProfilingEnabled() && !isTimestampedEvent) { + return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; + } + + UrReturnHelper returnValue(propValueSize, pPropValue, pPropValueSizeRet); + + // For timestamped events we have the timestamps ready directly on the event + // handle, so we short-circuit the return. 
+ if (isTimestampedEvent) { + uint64_t contextStartTime = hEvent->getEventStartTimestmap(); + switch (propName) { + case UR_PROFILING_INFO_COMMAND_QUEUED: + case UR_PROFILING_INFO_COMMAND_SUBMIT: + return returnValue(contextStartTime); + case UR_PROFILING_INFO_COMMAND_END: + case UR_PROFILING_INFO_COMMAND_START: + case UR_PROFILING_INFO_COMMAND_COMPLETE: { + return returnValue(hEvent->getEventEndTimestamp()); + } + default: + logger::error("urEventGetProfilingInfo: not supported ParamName"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + + ze_kernel_timestamp_result_t tsResult; + + auto zeTimerResolution = + hEvent->getDevice()->ZeDeviceProperties->timerResolution; + auto timestampMaxValue = hEvent->getDevice()->getTimestampMask(); + + switch (propName) { + case UR_PROFILING_INFO_COMMAND_START: { + ZE2UR_CALL(zeEventQueryKernelTimestamp, (hEvent->getZeEvent(), &tsResult)); + uint64_t contextStartTime = + (tsResult.global.kernelStart & timestampMaxValue) * zeTimerResolution; + return returnValue(contextStartTime); + } + case UR_PROFILING_INFO_COMMAND_END: + case UR_PROFILING_INFO_COMMAND_COMPLETE: { + ZE2UR_CALL(zeEventQueryKernelTimestamp, (hEvent->getZeEvent(), &tsResult)); + + uint64_t contextStartTime = + (tsResult.global.kernelStart & timestampMaxValue); + + auto adjustedEndTime = + adjustEndEventTimestamp(contextStartTime, tsResult.global.kernelEnd, + timestampMaxValue, zeTimerResolution); + return returnValue(adjustedEndTime); + } + case UR_PROFILING_INFO_COMMAND_QUEUED: + case UR_PROFILING_INFO_COMMAND_SUBMIT: + // Note: No users for this case + // The "command_submit" time is implemented by recording submission + // timestamp with a call to urDeviceGetGlobalTimestamps before command + // enqueue. 
+ // + return returnValue(uint64_t{0}); + default: + logger::error("urEventGetProfilingInfo: not supported ParamName"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/v2/event.hpp b/source/adapters/level_zero/v2/event.hpp index 4f617d11f3..394f139b30 100644 --- a/source/adapters/level_zero/v2/event.hpp +++ b/source/adapters/level_zero/v2/event.hpp @@ -15,20 +15,48 @@ #include #include +#include "common.hpp" #include "event_provider.hpp" namespace v2 { +class event_pool; +} -class ur_event { +struct ur_event_handle_t_ : _ur_object { public: - void attachZeHandle(event_allocation); - event_borrowed detachZeHandle(); + ur_event_handle_t_(v2::raii::cache_borrowed_event eventAllocation, + v2::event_pool *pool); - ze_event_handle_t getZeEvent(); + void reset(); + ze_event_handle_t getZeEvent() const; + + ur_result_t retain(); + ur_result_t release(); + + // Tells if this event was created as a timestamp event, allowing profiling + // info even if profiling is not enabled. + bool isTimestamped() const; + + // Tells if this event comes from a pool that has profiling enabled. 
+ bool isProfilingEnabled() const; + + // Device associated with this event + ur_device_handle_t getDevice() const; + + void recordStartTimestamp(); + uint64_t *getEventEndTimestampPtr(); + + uint64_t getEventStartTimestmap() const; + uint64_t getEventEndTimestamp(); private: - event_type type; - event_borrowed zeEvent; -}; + v2::raii::cache_borrowed_event zeEvent; + v2::event_pool *pool; -} // namespace v2 + uint64_t adjustedEventStartTimestamp; + uint64_t recordEventEndTimestamp; + uint64_t adjustedEventEndTimestamp; + + const uint64_t zeTimerResolution; + const uint64_t timestampMaxValue; +}; diff --git a/source/adapters/level_zero/v2/event_pool.cpp b/source/adapters/level_zero/v2/event_pool.cpp index 7ba2d0f9af..fe63681764 100644 --- a/source/adapters/level_zero/v2/event_pool.cpp +++ b/source/adapters/level_zero/v2/event_pool.cpp @@ -7,37 +7,51 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +#include "event_pool.hpp" +#include "common/latency_tracker.hpp" #include "ur_api.h" -#include namespace v2 { static constexpr size_t EVENTS_BURST = 64; -ur_event *event_pool::allocate() { +ur_event_handle_t_ *event_pool::allocate() { + TRACK_SCOPE_LATENCY("event_pool::allocate"); + + std::unique_lock lock(*mutex); + if (freelist.empty()) { auto start = events.size(); auto end = start + EVENTS_BURST; - events.resize(end); for (; start < end; ++start) { + events.emplace_back(provider->allocate(), this); freelist.push_back(&events.at(start)); } } auto event = freelist.back(); - - auto ZeEvent = provider->allocate(); - event->attachZeHandle(std::move(ZeEvent)); - freelist.pop_back(); return event; } -void event_pool::free(ur_event *event) { - auto _ = event->detachZeHandle(); +void event_pool::free(ur_event_handle_t_ *event) { + TRACK_SCOPE_LATENCY("event_pool::free"); + + std::unique_lock lock(*mutex); + event->reset(); freelist.push_back(event); + + // The event is still in the 
pool, so we need to increment the refcount + assert(event->RefCount.load() == 0); + event->RefCount.increment(); +} + +event_provider *event_pool::getProvider() const { return provider.get(); } + +event_flags_t event_pool::getFlags() const { + return getProvider()->eventFlags(); } } // namespace v2 diff --git a/source/adapters/level_zero/v2/event_pool.hpp b/source/adapters/level_zero/v2/event_pool.hpp index 8976daa939..924d29b907 100644 --- a/source/adapters/level_zero/v2/event_pool.hpp +++ b/source/adapters/level_zero/v2/event_pool.hpp @@ -19,8 +19,8 @@ #include #include +#include "../common.hpp" #include "../device.hpp" -#include "common.hpp" #include "event.hpp" #include "event_provider.hpp" @@ -28,8 +28,9 @@ namespace v2 { class event_pool { public: + // store weak reference to the queue as event_pool is part of the queue event_pool(std::unique_ptr Provider) - : provider(std::move(Provider)){}; + : provider(std::move(Provider)), mutex(std::make_unique()){}; event_pool(event_pool &&other) = default; event_pool &operator=(event_pool &&other) = default; @@ -37,16 +38,24 @@ class event_pool { event_pool(const event_pool &) = delete; event_pool &operator=(const event_pool &) = delete; - DeviceId Id() { return provider->device()->Id; }; + DeviceId Id() { return provider->device()->Id.value(); }; - ur_event *allocate(); - void free(ur_event *event); + // Allocate an event from the pool. Thread safe. + ur_event_handle_t_ *allocate(); -private: - std::deque events; - std::vector freelist; + // Free an event back to the pool. Thread safe. 
+ void free(ur_event_handle_t_ *event); + + event_provider *getProvider() const; + event_flags_t getFlags() const; +private: std::unique_ptr provider; + + std::deque events; + std::vector freelist; + + std::unique_ptr mutex; }; } // namespace v2 diff --git a/source/adapters/level_zero/v2/event_pool_cache.cpp b/source/adapters/level_zero/v2/event_pool_cache.cpp index b7064623f4..f0d16bed02 100644 --- a/source/adapters/level_zero/v2/event_pool_cache.cpp +++ b/source/adapters/level_zero/v2/event_pool_cache.cpp @@ -8,37 +8,40 @@ // //===----------------------------------------------------------------------===// #include "event_pool_cache.hpp" -#include "device.hpp" -#include "platform.hpp" +#include "../device.hpp" +#include "../platform.hpp" namespace v2 { event_pool_cache::event_pool_cache(size_t max_devices, ProviderCreateFunc ProviderCreate) : providerCreate(ProviderCreate) { - pools.resize(max_devices); + pools.resize(max_devices * (1ULL << EVENT_FLAGS_USED_BITS)); } event_pool_cache::~event_pool_cache() {} -event_pool_borrowed event_pool_cache::borrow(DeviceId id) { +raii::cache_borrowed_event_pool event_pool_cache::borrow(DeviceId id, + event_flags_t flags) { std::unique_lock Lock(mutex); - if (id >= pools.size()) { + event_descriptor event_desc{id, flags}; + + if (event_desc.index() >= pools.size()) { return nullptr; } - auto &vec = pools[id]; + auto &vec = pools[event_desc.index()]; if (vec.empty()) { - vec.emplace_back(std::make_unique(providerCreate(id))); + vec.emplace_back(std::make_unique(providerCreate(id, flags))); } auto pool = vec.back().release(); vec.pop_back(); - return event_pool_borrowed(pool, [this](event_pool *pool) { + return raii::cache_borrowed_event_pool(pool, [this, flags](event_pool *pool) { std::unique_lock Lock(mutex); - pools[pool->Id()].emplace_back(pool); + pools[event_descriptor{pool->Id(), flags}.index()].emplace_back(pool); }); } diff --git a/source/adapters/level_zero/v2/event_pool_cache.hpp 
b/source/adapters/level_zero/v2/event_pool_cache.hpp index eff98b28ed..78d909182c 100644 --- a/source/adapters/level_zero/v2/event_pool_cache.hpp +++ b/source/adapters/level_zero/v2/event_pool_cache.hpp @@ -25,22 +25,35 @@ namespace v2 { -using event_pool_borrowed = +namespace raii { +using cache_borrowed_event_pool = std::unique_ptr>; +} // namespace raii class event_pool_cache { public: - using ProviderCreateFunc = - std::function(DeviceId)>; + using ProviderCreateFunc = std::function( + DeviceId, event_flags_t flags)>; event_pool_cache(size_t max_devices, ProviderCreateFunc); ~event_pool_cache(); - event_pool_borrowed borrow(DeviceId); + raii::cache_borrowed_event_pool borrow(DeviceId, event_flags_t flags); private: ur_mutex mutex; ProviderCreateFunc providerCreate; + + struct event_descriptor { + DeviceId device; + event_flags_t flags; + + uint64_t index() { + return uint64_t(flags) | (uint64_t(device) << EVENT_FLAGS_USED_BITS); + } + }; + + // Indexed by event_descriptor::index() std::vector>> pools; }; diff --git a/source/adapters/level_zero/v2/event_provider.hpp b/source/adapters/level_zero/v2/event_provider.hpp index 2cc256ab93..1fb87a8b6a 100644 --- a/source/adapters/level_zero/v2/event_provider.hpp +++ b/source/adapters/level_zero/v2/event_provider.hpp @@ -21,21 +21,27 @@ namespace v2 { -enum event_type { EVENT_REGULAR, EVENT_COUNTER }; +using event_flags_t = uint32_t; +enum event_flag_t { + EVENT_FLAGS_COUNTER = UR_BIT(0), + EVENT_FLAGS_PROFILING_ENABLED = UR_BIT(1), +}; +static constexpr size_t EVENT_FLAGS_USED_BITS = 2; -using event_borrowed = - std::unique_ptr<_ze_event_handle_t, std::function>; +class event_provider; -struct event_allocation { - event_type type; - event_borrowed borrow; -}; +namespace raii { +using cache_borrowed_event = + std::unique_ptr<_ze_event_handle_t, + std::function>; +} // namespace raii class event_provider { public: virtual ~event_provider() = default; - virtual event_allocation allocate() = 0; + virtual 
raii::cache_borrowed_event allocate() = 0; virtual ur_device_handle_t device() = 0; + virtual event_flags_t eventFlags() const = 0; }; } // namespace v2 diff --git a/source/adapters/level_zero/v2/event_provider_counter.cpp b/source/adapters/level_zero/v2/event_provider_counter.cpp index 14e33a5700..1adc9a26e0 100644 --- a/source/adapters/level_zero/v2/event_provider_counter.cpp +++ b/source/adapters/level_zero/v2/event_provider_counter.cpp @@ -7,14 +7,16 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +#include +#include + +#include "context.hpp" +#include "event_provider.hpp" #include "event_provider_counter.hpp" -#include "../context.hpp" +#include "loader/ze_loader.h" + #include "../device.hpp" #include "../platform.hpp" -#include "event_provider.hpp" -#include "loader/ze_loader.h" -#include "ur_api.h" -#include "ze_api.h" namespace v2 { @@ -25,25 +27,19 @@ provider_counter::provider_counter(ur_platform_handle_t platform, ZE2UR_CALL_THROWS(zeDriverGetExtensionFunctionAddress, (platform->ZeDriver, "zexCounterBasedEventCreate", (void **)&this->eventCreateFunc)); - ZE2UR_CALL_THROWS( - zelLoaderTranslateHandle, - (ZEL_HANDLE_CONTEXT, context->ZeContext, (void **)&translatedContext)); + ZE2UR_CALL_THROWS(zelLoaderTranslateHandle, + (ZEL_HANDLE_CONTEXT, context->getZeHandle(), + (void **)&translatedContext)); ZE2UR_CALL_THROWS( zelLoaderTranslateHandle, (ZEL_HANDLE_DEVICE, device->ZeDevice, (void **)&translatedDevice)); } -provider_counter::~provider_counter() { - for (auto &e : freelist) { - ZE_CALL_NOCHECK(zeEventDestroy, (e)); - } -} - -event_allocation provider_counter::allocate() { +raii::cache_borrowed_event provider_counter::allocate() { if (freelist.empty()) { ZeStruct desc; desc.index = 0; - desc.signal = 0; + desc.signal = ZE_EVENT_SCOPE_FLAG_HOST; desc.wait = 0; ze_event_handle_t handle; @@ -54,15 +50,18 @@ event_allocation provider_counter::allocate() { 
freelist.emplace_back(handle); } - auto event = freelist.back(); + auto event = std::move(freelist.back()); freelist.pop_back(); - return {event_type::EVENT_COUNTER, - event_borrowed(event, [this](ze_event_handle_t handle) { - freelist.push_back(handle); - })}; + return raii::cache_borrowed_event( + event.release(), + [this](ze_event_handle_t handle) { freelist.push_back(handle); }); } ur_device_handle_t provider_counter::device() { return urDevice; } +event_flags_t provider_counter::eventFlags() const { + return EVENT_FLAGS_COUNTER; +} + } // namespace v2 diff --git a/source/adapters/level_zero/v2/event_provider_counter.hpp b/source/adapters/level_zero/v2/event_provider_counter.hpp index 60a8107469..98e405cc3f 100644 --- a/source/adapters/level_zero/v2/event_provider_counter.hpp +++ b/source/adapters/level_zero/v2/event_provider_counter.hpp @@ -19,11 +19,12 @@ #include #include -#include "../device.hpp" #include "common.hpp" #include "event.hpp" #include "event_provider.hpp" +#include "../device.hpp" + namespace v2 { typedef ze_result_t (*zexCounterBasedEventCreate)( @@ -33,12 +34,13 @@ typedef ze_result_t (*zexCounterBasedEventCreate)( class provider_counter : public event_provider { public: + // TODO: does this provider support profiling? 
provider_counter(ur_platform_handle_t platform, ur_context_handle_t, ur_device_handle_t); - ~provider_counter() override; - event_allocation allocate() override; + raii::cache_borrowed_event allocate() override; ur_device_handle_t device() override; + event_flags_t eventFlags() const override; private: ur_device_handle_t urDevice; @@ -48,7 +50,7 @@ class provider_counter : public event_provider { zexCounterBasedEventCreate eventCreateFunc; - std::vector freelist; + std::vector freelist; }; } // namespace v2 diff --git a/source/adapters/level_zero/v2/event_provider_normal.cpp b/source/adapters/level_zero/v2/event_provider_normal.cpp index 4e8287b36c..d2d7f3a198 100644 --- a/source/adapters/level_zero/v2/event_provider_normal.cpp +++ b/source/adapters/level_zero/v2/event_provider_normal.cpp @@ -7,27 +7,34 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// + +#include +#include + +#include + +#include "context.hpp" +#include "event_provider.hpp" #include "event_provider_normal.hpp" + +#include "../common/latency_tracker.hpp" + #include "../common.hpp" -#include "../context.hpp" -#include "event_provider.hpp" -#include "ur_api.h" -#include "ze_api.h" -#include namespace v2 { static constexpr int EVENTS_BURST = 64; provider_pool::provider_pool(ur_context_handle_t context, - ur_device_handle_t device, event_type events, - queue_type queue) { + ur_device_handle_t device, queue_type queue, + event_flags_t flags) { ZeStruct desc; desc.count = EVENTS_BURST; - desc.flags = 0; + desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + + ze_event_pool_counter_based_exp_desc_t counterBasedExt = { + ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC, nullptr, 0}; - if (events == event_type::EVENT_COUNTER) { - ze_event_pool_counter_based_exp_desc_t counterBasedExt = { - ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC, nullptr}; + if (flags & EVENT_FLAGS_COUNTER) { counterBasedExt.flags = 
queue == queue_type::QUEUE_IMMEDIATE ? ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_IMMEDIATE @@ -35,56 +42,45 @@ provider_pool::provider_pool(ur_context_handle_t context, desc.pNext = &counterBasedExt; } + if (flags & EVENT_FLAGS_PROFILING_ENABLED) { + desc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + } + ZE2UR_CALL_THROWS(zeEventPoolCreate, - (context->ZeContext, &desc, 1, + (context->getZeHandle(), &desc, 1, const_cast(&device->ZeDevice), - &pool)); + pool.ptr())); freelist.resize(EVENTS_BURST); for (int i = 0; i < EVENTS_BURST; ++i) { ZeStruct desc; desc.index = i; - desc.signal = 0; + desc.signal = ZE_EVENT_SCOPE_FLAG_HOST; desc.wait = 0; - ZE2UR_CALL_THROWS(zeEventCreate, (pool, &desc, &freelist[i])); - } -} - -provider_pool::~provider_pool() { - for (auto e : freelist) { - ZE_CALL_NOCHECK(zeEventDestroy, (e)); + ZE2UR_CALL_THROWS(zeEventCreate, (pool.get(), &desc, freelist[i].ptr())); } - ZE_CALL_NOCHECK(zeEventPoolDestroy, (pool)); } -event_borrowed provider_pool::allocate() { +raii::cache_borrowed_event provider_pool::allocate() { if (freelist.empty()) { return nullptr; } - ze_event_handle_t e = freelist.back(); + auto e = std::move(freelist.back()); freelist.pop_back(); - return event_borrowed( - e, [this](ze_event_handle_t handle) { freelist.push_back(handle); }); + return raii::cache_borrowed_event( + e.release(), + [this](ze_event_handle_t handle) { freelist.push_back(handle); }); } size_t provider_pool::nfree() const { return freelist.size(); } -provider_normal::provider_normal(ur_context_handle_t context, - ur_device_handle_t device, event_type etype, - queue_type qtype) - : producedType(etype), queueType(qtype), urContext(context), - urDevice(device) { - urDeviceRetain(device); -} - -provider_normal::~provider_normal() { urDeviceRelease(urDevice); } - std::unique_ptr provider_normal::createProviderPool() { - return std::make_unique(urContext, urDevice, producedType, - queueType); + return std::make_unique(urContext, urDevice, queueType, flags); } 
-event_allocation provider_normal::allocate() { +raii::cache_borrowed_event provider_normal::allocate() { + TRACK_SCOPE_LATENCY("provider_normal::allocate"); + if (pools.empty()) { pools.emplace_back(createProviderPool()); } @@ -93,7 +89,7 @@ event_allocation provider_normal::allocate() { auto &pool = pools.back(); auto event = pool->allocate(); if (event) { - return {producedType, std::move(event)}; + return event; } } @@ -105,7 +101,7 @@ event_allocation provider_normal::allocate() { auto &pool = pools.back(); auto event = pool->allocate(); if (event) { - return {producedType, std::move(event)}; + return event; } } @@ -116,4 +112,6 @@ event_allocation provider_normal::allocate() { ur_device_handle_t provider_normal::device() { return urDevice; } +event_flags_t provider_normal::eventFlags() const { return flags; } + } // namespace v2 diff --git a/source/adapters/level_zero/v2/event_provider_normal.hpp b/source/adapters/level_zero/v2/event_provider_normal.hpp index 4ab72ccaed..a0f672b944 100644 --- a/source/adapters/level_zero/v2/event_provider_normal.hpp +++ b/source/adapters/level_zero/v2/event_provider_normal.hpp @@ -19,10 +19,12 @@ #include #include -#include "../device.hpp" #include "common.hpp" #include "event.hpp" +#include "../device.hpp" +#include "../ur_interface_loader.hpp" + namespace v2 { enum queue_type { @@ -32,35 +34,36 @@ enum queue_type { class provider_pool { public: - provider_pool(ur_context_handle_t, ur_device_handle_t, event_type, - queue_type); - ~provider_pool(); + provider_pool(ur_context_handle_t, ur_device_handle_t, queue_type, + event_flags_t flags); - event_borrowed allocate(); + raii::cache_borrowed_event allocate(); size_t nfree() const; private: - // TODO: use a RAII wrapper for the pool handle - ze_event_pool_handle_t pool; - - std::vector freelist; + raii::ze_event_pool_handle_t pool; + std::vector freelist; }; class provider_normal : public event_provider { public: - provider_normal(ur_context_handle_t, ur_device_handle_t, 
event_type, - queue_type); + provider_normal(ur_context_handle_t context, ur_device_handle_t device, + queue_type qtype, event_flags_t flags) + : queueType(qtype), urContext(context), urDevice(device), flags(flags) { + ur::level_zero::urDeviceRetain(device); + } - ~provider_normal() override; + ~provider_normal() override { ur::level_zero::urDeviceRelease(urDevice); } - event_allocation allocate() override; + raii::cache_borrowed_event allocate() override; ur_device_handle_t device() override; + event_flags_t eventFlags() const override; private: - event_type producedType; queue_type queueType; ur_context_handle_t urContext; ur_device_handle_t urDevice; + event_flags_t flags; std::unique_ptr createProviderPool(); std::vector> pools; diff --git a/source/adapters/level_zero/v2/kernel.cpp b/source/adapters/level_zero/v2/kernel.cpp new file mode 100644 index 0000000000..e98221b9e5 --- /dev/null +++ b/source/adapters/level_zero/v2/kernel.cpp @@ -0,0 +1,473 @@ +//===--------- kernel.cpp - Level Zero Adapter ---------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include "context.hpp" +#include "kernel.hpp" +#include "memory.hpp" + +#include "../device.hpp" +#include "../platform.hpp" +#include "../program.hpp" +#include "../ur_interface_loader.hpp" + +ur_single_device_kernel_t::ur_single_device_kernel_t(ur_device_handle_t hDevice, + ze_kernel_handle_t hKernel, + bool ownZeHandle) + : hDevice(hDevice), hKernel(hKernel, ownZeHandle) { + zeKernelProperties.Compute = + [hKernel = hKernel](ze_kernel_properties_t &properties) { + ZE_CALL_NOCHECK(zeKernelGetProperties, (hKernel, &properties)); + }; +} + +ur_result_t ur_single_device_kernel_t::release() { + hKernel.reset(); + return UR_RESULT_SUCCESS; +} + +ur_kernel_handle_t_::ur_kernel_handle_t_(ur_program_handle_t hProgram, + const char *kernelName) + : hProgram(hProgram), + deviceKernels(hProgram->Context->getPlatform()->getNumDevices()) { + ur::level_zero::urProgramRetain(hProgram); + + for (auto [zeDevice, zeModule] : hProgram->ZeModuleMap) { + ZeStruct zeKernelDesc; + zeKernelDesc.pKernelName = kernelName; + + ze_kernel_handle_t zeKernel; + ZE2UR_CALL_THROWS(zeKernelCreate, (zeModule, &zeKernelDesc, &zeKernel)); + + auto urDevice = std::find_if(hProgram->Context->getDevices().begin(), + hProgram->Context->getDevices().end(), + [zeDevice = zeDevice](const auto &urDevice) { + return urDevice->ZeDevice == zeDevice; + }); + assert(urDevice != hProgram->Context->getDevices().end()); + auto deviceId = (*urDevice)->Id.value(); + + deviceKernels[deviceId].emplace(*urDevice, zeKernel, true); + } + completeInitialization(); +} + +ur_kernel_handle_t_::ur_kernel_handle_t_( + ur_native_handle_t hNativeKernel, ur_program_handle_t hProgram, + const ur_kernel_native_properties_t *pProperties) + : hProgram(hProgram), deviceKernels(1) { + ze_kernel_handle_t zeKernel = ur_cast(hNativeKernel); + + if (!zeKernel) { + 
throw UR_RESULT_ERROR_INVALID_KERNEL; + } + + deviceKernels.back().emplace(nullptr, zeKernel, + pProperties->isNativeHandleOwned); + completeInitialization(); +} + +ur_result_t ur_kernel_handle_t_::release() { + // manually release kernels to allow errors to be propagated + for (auto &singleDeviceKernelOpt : deviceKernels) { + if (singleDeviceKernelOpt.has_value()) { + singleDeviceKernelOpt.value().hKernel.reset(); + } + } + + UR_CALL_THROWS(ur::level_zero::urProgramRelease(hProgram)); + + return UR_RESULT_SUCCESS; +} + +void ur_kernel_handle_t_::completeInitialization() { + // Cache kernel name. Should be the same for all devices + assert(deviceKernels.size() > 0); + auto nonEmptyKernel = + std::find_if(deviceKernels.begin(), deviceKernels.end(), + [](const auto &kernel) { return kernel.has_value(); }); + + zeKernelName.Compute = [kernel = + &nonEmptyKernel->value()](std::string &name) { + size_t size = 0; + ZE_CALL_NOCHECK(zeKernelGetName, (kernel->hKernel.get(), &size, nullptr)); + name.resize(size); + ZE_CALL_NOCHECK(zeKernelGetName, + (kernel->hKernel.get(), &size, name.data())); + }; +} + +ze_kernel_handle_t +ur_kernel_handle_t_::getZeHandle(ur_device_handle_t hDevice) { + // root-device's kernel can be submitted to a sub-device's queue + if (hDevice->isSubDevice()) { + hDevice = hDevice->RootDevice; + } + + if (deviceKernels.size() == 1) { + assert(deviceKernels[0].has_value()); + assert(deviceKernels[0].value().hKernel.get()); + + auto &kernel = deviceKernels[0].value(); + + // hDevice is nullptr for native handle + if ((kernel.hDevice != nullptr && kernel.hDevice != hDevice)) { + throw UR_RESULT_ERROR_INVALID_DEVICE; + } + + return kernel.hKernel.get(); + } + + if (!deviceKernels[hDevice->Id.value()].has_value()) { + throw UR_RESULT_ERROR_INVALID_DEVICE; + } + + assert(deviceKernels[hDevice->Id.value()].value().hKernel.get()); + + return deviceKernels[hDevice->Id.value()].value().hKernel.get(); +} + +const std::string &ur_kernel_handle_t_::getName() const 
{ + return *zeKernelName.operator->(); +} + +const ze_kernel_properties_t & +ur_kernel_handle_t_::getProperties(ur_device_handle_t hDevice) const { + if (!deviceKernels[hDevice->Id.value()].has_value()) { + throw UR_RESULT_ERROR_INVALID_DEVICE; + } + + assert(deviceKernels[hDevice->Id.value()].value().hKernel.get()); + + return *deviceKernels[hDevice->Id.value()] + .value() + .zeKernelProperties. + operator->(); +} + +ur_result_t ur_kernel_handle_t_::setArgValue( + uint32_t argIndex, size_t argSize, + const ur_kernel_arg_value_properties_t *pProperties, + const void *pArgValue) { + std::ignore = pProperties; + + // OpenCL: "the arg_value pointer can be NULL or point to a NULL value + // in which case a NULL value will be used as the value for the argument + // declared as a pointer to global or constant memory in the kernel" + // + // We don't know the type of the argument but it seems that the only time + // SYCL RT would send a pointer to NULL in 'arg_value' is when the argument + // is a NULL pointer. Treat a pointer to NULL in 'arg_value' as a NULL. 
+ if (argSize == sizeof(void *) && pArgValue && + *(void **)(const_cast(pArgValue)) == nullptr) { + pArgValue = nullptr; + } + + std::scoped_lock guard(Mutex); + + for (auto &singleDeviceKernel : deviceKernels) { + if (!singleDeviceKernel.has_value()) { + continue; + } + + ZE2UR_CALL(zeKernelSetArgumentValue, + (singleDeviceKernel.value().hKernel.get(), argIndex, argSize, + pArgValue)); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_kernel_handle_t_::setArgPointer( + uint32_t argIndex, const ur_kernel_arg_pointer_properties_t *pProperties, + const void *pArgValue) { + std::ignore = pProperties; + + // KernelSetArgValue is expecting a pointer to the argument + return setArgValue(argIndex, sizeof(const void *), nullptr, &pArgValue); +} + +ur_program_handle_t ur_kernel_handle_t_::getProgramHandle() const { + return hProgram; +} + +ur_result_t ur_kernel_handle_t_::setExecInfo(ur_kernel_exec_info_t propName, + const void *pPropValue) { + std::scoped_lock Guard(Mutex); + + for (auto &kernel : deviceKernels) { + if (!kernel.has_value()) + continue; + if (propName == UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS && + *(static_cast(pPropValue)) == true) { + // The whole point for users really was to not need to know anything + // about the types of allocations kernel uses. So in DPC++ we always + // just set all 3 modes for each kernel. 
+ ze_kernel_indirect_access_flags_t indirectFlags = + ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST | + ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE | + ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED; + ZE2UR_CALL(zeKernelSetIndirectAccess, + (kernel->hKernel.get(), indirectFlags)); + } else if (propName == UR_KERNEL_EXEC_INFO_CACHE_CONFIG) { + ze_cache_config_flag_t zeCacheConfig{}; + auto cacheConfig = + *(static_cast(pPropValue)); + if (cacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_SLM) + zeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_SLM; + else if (cacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_DATA) + zeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_DATA; + else if (cacheConfig == UR_KERNEL_CACHE_CONFIG_DEFAULT) + zeCacheConfig = static_cast(0); + else + // Unexpected cache configuration value. + return UR_RESULT_ERROR_INVALID_VALUE; + ZE2UR_CALL(zeKernelSetCacheConfig, + (kernel->hKernel.get(), zeCacheConfig);); + } else { + logger::error("urKernelSetExecInfo: unsupported ParamName"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + + return UR_RESULT_SUCCESS; +} + +std::vector ur_kernel_handle_t_::getDevices() const { + std::vector devices; + for (size_t i = 0; i < deviceKernels.size(); ++i) { + if (deviceKernels[i].has_value()) { + devices.push_back(deviceKernels[i].value().hDevice); + } + } + return devices; +} + +namespace ur::level_zero { +ur_result_t urKernelCreate(ur_program_handle_t hProgram, + const char *pKernelName, + ur_kernel_handle_t *phKernel) { + *phKernel = new ur_kernel_handle_t_(hProgram, pKernelName); + return UR_RESULT_SUCCESS; +} + +ur_result_t urKernelRetain( + ur_kernel_handle_t hKernel ///< [in] handle for the Kernel to retain +) { + hKernel->RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +ur_result_t urKernelRelease( + ur_kernel_handle_t hKernel ///< [in] handle for the Kernel to release +) { + if (!hKernel->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + hKernel->release(); + delete hKernel; + + return UR_RESULT_SUCCESS; +} + +ur_result_t 
urKernelSetArgValue( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + uint32_t argIndex, ///< [in] argument index in range [0, num args - 1] + size_t argSize, ///< [in] size of argument type + const ur_kernel_arg_value_properties_t + *pProperties, ///< [in][optional] argument properties + const void + *pArgValue ///< [in] argument value represented as matching arg type. +) { + TRACK_SCOPE_LATENCY("ur_kernel_handle_t_::setArgValue"); + return hKernel->setArgValue(argIndex, argSize, pProperties, pArgValue); +} + +ur_result_t urKernelSetArgPointer( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + uint32_t argIndex, ///< [in] argument index in range [0, num args - 1] + const ur_kernel_arg_pointer_properties_t + *pProperties, ///< [in][optional] argument properties + const void + *pArgValue ///< [in] argument value represented as matching arg type. +) { + TRACK_SCOPE_LATENCY("ur_kernel_handle_t_::setArgPointer"); + return hKernel->setArgPointer(argIndex, pProperties, pArgValue); +} + +ur_result_t +urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_mem_obj_properties_t *pProperties, + ur_mem_handle_t hArgValue) { + TRACK_SCOPE_LATENCY("ur_kernel_handle_t_::setArgMemObj"); + + // TODO: support properties + std::ignore = pProperties; + + auto kernelDevices = hKernel->getDevices(); + if (kernelDevices.size() == 1) { + auto zePtr = hArgValue->getDevicePtr( + kernelDevices.front(), ur_mem_handle_t_::access_mode_t::read_write, 0, + hArgValue->getSize(), nullptr); + return hKernel->setArgPointer(argIndex, nullptr, zePtr); + } else { + // TODO: if devices do not have p2p capabilities, we need to have allocation + // on each device. Do this the same way as in legacy (keep a pending Args + // vector and do actual allocation on kernel submission) or allocate the + // memory immediately (only for small allocations?). + + // Get memory that is accessible by the first device. 
+ // If kernel is submitted to a different device the memory + // will be accessed trough the link or migrated in enqueueKernelLaunch. + auto zePtr = hArgValue->getDevicePtr( + kernelDevices.front(), ur_mem_handle_t_::access_mode_t::read_write, 0, + hArgValue->getSize(), nullptr); + return hKernel->setArgPointer(argIndex, nullptr, zePtr); + } +} + +ur_result_t +urKernelSetArgLocal(ur_kernel_handle_t hKernel, uint32_t argIndex, + size_t argSize, + const ur_kernel_arg_local_properties_t *pProperties) { + TRACK_SCOPE_LATENCY("ur_kernel_handle_t_::setArgLocal"); + + std::ignore = pProperties; + + return hKernel->setArgValue(argIndex, argSize, nullptr, nullptr); +} + +ur_result_t urKernelSetExecInfo( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_kernel_exec_info_t propName, ///< [in] name of the execution attribute + size_t propSize, ///< [in] size in byte the attribute value + const ur_kernel_exec_info_properties_t + *pProperties, ///< [in][optional] pointer to execution info properties + const void *pPropValue ///< [in][range(0, propSize)] pointer to memory + ///< location holding the property value. +) { + std::ignore = propSize; + std::ignore = pProperties; + + return hKernel->setExecInfo(propName, pPropValue); +} + +ur_result_t urKernelGetGroupInfo( + ur_kernel_handle_t hKernel, ///< [in] handle of the Kernel object + ur_device_handle_t hDevice, ///< [in] handle of the Device object + ur_kernel_group_info_t + paramName, ///< [in] name of the work Group property to query + size_t + paramValueSize, ///< [in] size of the Kernel Work Group property value + void *pParamValue, ///< [in,out][optional][range(0, propSize)] value of the + ///< Kernel Work Group property. + size_t *pParamValueSizeRet ///< [out][optional] pointer to the actual size + ///< in bytes of data being queried by propName. 
+) { + UrReturnHelper returnValue(paramValueSize, pParamValue, pParamValueSizeRet); + + std::shared_lock Guard(hKernel->Mutex); + switch (paramName) { + case UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { + // TODO: To revisit after level_zero/issues/262 is resolved + struct { + size_t Arr[3]; + } GlobalWorkSize = {{(hDevice->ZeDeviceComputeProperties->maxGroupSizeX * + hDevice->ZeDeviceComputeProperties->maxGroupCountX), + (hDevice->ZeDeviceComputeProperties->maxGroupSizeY * + hDevice->ZeDeviceComputeProperties->maxGroupCountY), + (hDevice->ZeDeviceComputeProperties->maxGroupSizeZ * + hDevice->ZeDeviceComputeProperties->maxGroupCountZ)}}; + return returnValue(GlobalWorkSize); + } + case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + ZeStruct workGroupProperties; + workGroupProperties.maxGroupSize = 0; + + ZeStruct kernelProperties; + kernelProperties.pNext = &workGroupProperties; + + auto zeDevice = hKernel->getZeHandle(hDevice); + auto zeResult = + ZE_CALL_NOCHECK(zeKernelGetProperties, (zeDevice, &kernelProperties)); + if (zeResult == ZE_RESULT_SUCCESS && + workGroupProperties.maxGroupSize != 0) { + return returnValue(workGroupProperties.maxGroupSize); + } + return returnValue( + uint64_t{hDevice->ZeDeviceComputeProperties->maxTotalGroupSize}); + } + case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + auto props = hKernel->getProperties(hDevice); + struct { + size_t Arr[3]; + } WgSize = {{props.requiredGroupSizeX, props.requiredGroupSizeY, + props.requiredGroupSizeZ}}; + return returnValue(WgSize); + } + case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + auto props = hKernel->getProperties(hDevice); + return returnValue(uint32_t{props.localMemSize}); + } + case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + return returnValue( + size_t{hDevice->ZeDeviceProperties->physicalEUSimdWidth}); + } + case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + auto props = hKernel->getProperties(hDevice); + return returnValue(uint32_t{props.privateMemSize}); + } + case 
UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: + // No corresponding enumeration in Level Zero + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + default: { + logger::error( + "Unknown ParamName in urKernelGetGroupInfo: ParamName={}(0x{})", + paramName, logger::toHex(paramName)); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + return UR_RESULT_SUCCESS; +} + +ur_result_t urKernelGetSubGroupInfo( + ur_kernel_handle_t hKernel, ///< [in] handle of the Kernel object + ur_device_handle_t hDevice, ///< [in] handle of the Device object + ur_kernel_sub_group_info_t + propName, ///< [in] name of the SubGroup property to query + size_t propSize, ///< [in] size of the Kernel SubGroup property value + void *pPropValue, ///< [in,out][range(0, propSize)][optional] value of the + ///< Kernel SubGroup property. + size_t *pPropSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data being queried by propName. +) { + std::ignore = hDevice; + + UrReturnHelper returnValue(propSize, pPropValue, pPropSizeRet); + + auto props = hKernel->getProperties(hDevice); + + std::shared_lock Guard(hKernel->Mutex); + if (propName == UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE) { + returnValue(uint32_t{props.maxSubgroupSize}); + } else if (propName == UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS) { + returnValue(uint32_t{props.maxNumSubgroups}); + } else if (propName == UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS) { + returnValue(uint32_t{props.requiredNumSubGroups}); + } else if (propName == UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL) { + returnValue(uint32_t{props.requiredSubgroupSize}); + } else { + die("urKernelGetSubGroupInfo: parameter not implemented"); + return {}; + } + return UR_RESULT_SUCCESS; +} +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/v2/kernel.hpp b/source/adapters/level_zero/v2/kernel.hpp new file mode 100644 index 0000000000..2d3a891826 --- /dev/null +++ 
b/source/adapters/level_zero/v2/kernel.hpp @@ -0,0 +1,81 @@ +//===--------- kernel.hpp - Level Zero Adapter ---------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "../program.hpp" + +#include "common.hpp" + +struct ur_single_device_kernel_t { + ur_single_device_kernel_t(ur_device_handle_t hDevice, + ze_kernel_handle_t hKernel, bool ownZeHandle); + ur_result_t release(); + + ur_device_handle_t hDevice; + v2::raii::ze_kernel_handle_t hKernel; + mutable ZeCache> zeKernelProperties; +}; + +struct ur_kernel_handle_t_ : _ur_object { +private: +public: + ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *kernelName); + + // From native handle + ur_kernel_handle_t_(ur_native_handle_t hNativeKernel, + ur_program_handle_t hProgram, + const ur_kernel_native_properties_t *pProperties); + + // Get L0 kernel handle for a given device + ze_kernel_handle_t getZeHandle(ur_device_handle_t hDevice); + + // Get program handle of the kernel. + ur_program_handle_t getProgramHandle() const; + + // Get devices the kernel is built for. + std::vector getDevices() const; + + // Get name of the kernel. + const std::string &getName() const; + + // Get properties of the kernel. + const ze_kernel_properties_t &getProperties(ur_device_handle_t hDevice) const; + + // Implementation of urKernelSetArgValue. + ur_result_t setArgValue(uint32_t argIndex, size_t argSize, + const ur_kernel_arg_value_properties_t *pProperties, + const void *pArgValue); + + // Implementation of urKernelSetArgPointer. + ur_result_t + setArgPointer(uint32_t argIndex, + const ur_kernel_arg_pointer_properties_t *pProperties, + const void *pArgValue); + + // Implementation of urKernelSetExecInfo. 
+ ur_result_t setExecInfo(ur_kernel_exec_info_t propName, + const void *pPropValue); + + // Perform cleanup. + ur_result_t release(); + +private: + // Keep the program of the kernel. + ur_program_handle_t hProgram; + + // Vector of ur_single_device_kernel_t indexed by device->Id + std::vector> deviceKernels; + + // Cache of the kernel name. + mutable ZeCache zeKernelName; + + void completeInitialization(); +}; diff --git a/source/adapters/level_zero/v2/memory.cpp b/source/adapters/level_zero/v2/memory.cpp new file mode 100644 index 0000000000..0b1bc63b03 --- /dev/null +++ b/source/adapters/level_zero/v2/memory.cpp @@ -0,0 +1,361 @@ +//===--------- memory.cpp - Level Zero Adapter ---------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "memory.hpp" +#include "context.hpp" + +#include "../helpers/memory_helpers.hpp" + +ur_mem_handle_t_::ur_mem_handle_t_(ur_context_handle_t hContext, size_t size) + : hContext(hContext), size(size) {} + +ur_usm_handle_t_::ur_usm_handle_t_(ur_context_handle_t hContext, size_t size, + const void *ptr) + : ur_mem_handle_t_(hContext, size), ptr(const_cast(ptr)) {} + +ur_usm_handle_t_::~ur_usm_handle_t_() {} + +void *ur_usm_handle_t_::getDevicePtr( + ur_device_handle_t hDevice, access_mode_t access, size_t offset, + size_t size, std::function migrate) { + std::ignore = hDevice; + std::ignore = access; + std::ignore = offset; + std::ignore = size; + std::ignore = migrate; + return ptr; +} + +void *ur_usm_handle_t_::mapHostPtr( + access_mode_t access, size_t offset, size_t size, + std::function) { + std::ignore = access; + std::ignore = offset; + std::ignore = size; + return ptr; +} + +void ur_usm_handle_t_::unmapHostPtr( + void *pMappedPtr, 
std::function) { + std::ignore = pMappedPtr; + /* nop */ +} + +ur_integrated_mem_handle_t::ur_integrated_mem_handle_t( + ur_context_handle_t hContext, void *hostPtr, size_t size, + host_ptr_action_t hostPtrAction) + : ur_mem_handle_t_(hContext, size) { + bool hostPtrImported = false; + if (hostPtrAction == host_ptr_action_t::import) { + hostPtrImported = + maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, + hContext->getZeHandle(), hostPtr, size); + } + + if (!hostPtrImported) { + UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate( + hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &this->ptr)); + + if (hostPtr) { + std::memcpy(this->ptr, hostPtr, size); + } + } +} + +ur_integrated_mem_handle_t::~ur_integrated_mem_handle_t() { + if (ptr) { + auto ret = hContext->getDefaultUSMPool()->free(ptr); + if (ret != UR_RESULT_SUCCESS) { + logger::error("Failed to free host memory: {}", ret); + } + } +} + +void *ur_integrated_mem_handle_t::getDevicePtr( + ur_device_handle_t hDevice, access_mode_t access, size_t offset, + size_t size, std::function migrate) { + std::ignore = hDevice; + std::ignore = access; + std::ignore = offset; + std::ignore = size; + std::ignore = migrate; + return ptr; +} + +void *ur_integrated_mem_handle_t::mapHostPtr( + access_mode_t access, size_t offset, size_t size, + std::function migrate) { + std::ignore = access; + std::ignore = offset; + std::ignore = size; + std::ignore = migrate; + return ptr; +} + +void ur_integrated_mem_handle_t::unmapHostPtr( + void *pMappedPtr, std::function) { + std::ignore = pMappedPtr; + /* nop */ +} + +static ur_result_t synchronousZeCopy(ur_context_handle_t hContext, + ur_device_handle_t hDevice, void *dst, + const void *src, size_t size) { + auto commandList = hContext->commandListCache.getImmediateCommandList( + hDevice->ZeDevice, true, + hDevice + ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute] + .ZeOrdinal, + ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, 
ZE_COMMAND_QUEUE_PRIORITY_NORMAL, + std::nullopt); + + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (commandList.get(), dst, src, size, nullptr, 0, nullptr)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t +ur_discrete_mem_handle_t::migrateBufferTo(ur_device_handle_t hDevice, void *src, + size_t size) { + auto Id = hDevice->Id.value(); + + if (!deviceAllocations[Id]) { + UR_CALL(hContext->getDefaultUSMPool()->allocate(hContext, hDevice, nullptr, + UR_USM_TYPE_DEVICE, size, + &deviceAllocations[Id])); + } + + UR_CALL( + synchronousZeCopy(hContext, hDevice, deviceAllocations[Id], src, size)); + + activeAllocationDevice = hDevice; + + return UR_RESULT_SUCCESS; +} + +ur_discrete_mem_handle_t::ur_discrete_mem_handle_t(ur_context_handle_t hContext, + void *hostPtr, size_t size) + : ur_mem_handle_t_(hContext, size), + deviceAllocations(hContext->getPlatform()->getNumDevices()), + activeAllocationDevice(nullptr), hostAllocations() { + if (hostPtr) { + auto initialDevice = hContext->getDevices()[0]; + UR_CALL_THROWS(migrateBufferTo(initialDevice, hostPtr, size)); + } +} + +ur_discrete_mem_handle_t::~ur_discrete_mem_handle_t() { + for (auto &ptr : deviceAllocations) { + if (ptr) { + auto ret = hContext->getDefaultUSMPool()->free(ptr); + if (ret != UR_RESULT_SUCCESS) { + logger::error("Failed to free device memory: {}", ret); + } + } + } +} + +void *ur_discrete_mem_handle_t::getDevicePtrUnlocked( + ur_device_handle_t hDevice, access_mode_t access, size_t offset, + size_t size, std::function migrate) { + std::ignore = access; + std::ignore = size; + std::ignore = migrate; + + if (!activeAllocationDevice) { + UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate( + hContext, hDevice, nullptr, UR_USM_TYPE_DEVICE, getSize(), + &deviceAllocations[hDevice->Id.value()])); + activeAllocationDevice = hDevice; + } + + char *ptr; + if (activeAllocationDevice == hDevice) { + ptr = ur_cast(deviceAllocations[hDevice->Id.value()]); + return ptr + offset; + } + + auto &p2pDevices = 
hContext->getP2PDevices(hDevice); + auto p2pAccessible = std::find(p2pDevices.begin(), p2pDevices.end(), + activeAllocationDevice) != p2pDevices.end(); + + if (!p2pAccessible) { + // TODO: migrate buffer through the host + throw UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + // TODO: see if it's better to migrate the memory to the specified device + return ur_cast( + deviceAllocations[activeAllocationDevice->Id.value()]) + + offset; +} + +void *ur_discrete_mem_handle_t::getDevicePtr( + ur_device_handle_t hDevice, access_mode_t access, size_t offset, + size_t size, std::function migrate) { + std::lock_guard lock(this->Mutex); + return getDevicePtrUnlocked(hDevice, access, offset, size, migrate); +} + +void *ur_discrete_mem_handle_t::mapHostPtr( + access_mode_t access, size_t offset, size_t size, + std::function migrate) { + std::lock_guard lock(this->Mutex); + + // TODO: use async alloc? + + void *ptr; + UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate( + hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &ptr)); + + hostAllocations.emplace_back(ptr, size, offset, access); + + if (activeAllocationDevice && access != access_mode_t::write_only) { + auto srcPtr = + ur_cast(deviceAllocations[activeAllocationDevice->Id.value()]) + + offset; + migrate(srcPtr, hostAllocations.back().ptr, size); + } + + return hostAllocations.back().ptr; +} + +void ur_discrete_mem_handle_t::unmapHostPtr( + void *pMappedPtr, + std::function migrate) { + std::lock_guard lock(this->Mutex); + + for (auto &hostAllocation : hostAllocations) { + if (hostAllocation.ptr == pMappedPtr) { + void *devicePtr = nullptr; + if (activeAllocationDevice) { + devicePtr = ur_cast( + deviceAllocations[activeAllocationDevice->Id.value()]) + + hostAllocation.offset; + } else if (hostAllocation.access != access_mode_t::write_invalidate) { + devicePtr = ur_cast(getDevicePtrUnlocked( + hContext->getDevices()[0], access_mode_t::read_only, + hostAllocation.offset, hostAllocation.size, migrate)); + } + + if 
(devicePtr) { + migrate(hostAllocation.ptr, devicePtr, hostAllocation.size); + } + + // TODO: use async free here? + UR_CALL_THROWS(hContext->getDefaultUSMPool()->free(hostAllocation.ptr)); + return; + } + } + + // No mapping found + throw UR_RESULT_ERROR_INVALID_ARGUMENT; +} + +namespace ur::level_zero { +ur_result_t urMemBufferCreate(ur_context_handle_t hContext, + ur_mem_flags_t flags, size_t size, + const ur_buffer_properties_t *pProperties, + ur_mem_handle_t *phBuffer) { + if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { + // TODO: + // Having PI_MEM_FLAGS_HOST_PTR_ALLOC for buffer requires allocation of + // pinned host memory, see: + // sycl/doc/extensions/supported/sycl_ext_oneapi_use_pinned_host_memory_property.asciidoc + // We are however missing such functionality in Level Zero, so we just + // ignore the flag for now. + } + + void *hostPtr = pProperties ? pProperties->pHost : nullptr; + + // We treat integrated devices (physical memory shared with the CPU) + // differently from discrete devices (those with distinct memories). + // For integrated devices, allocating the buffer in the host memory + // enables automatic access from the device, and makes copying + // unnecessary in the map/unmap operations. This improves performance. + bool useHostBuffer = hContext->getDevices().size() == 1 && + hContext->getDevices()[0]->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_INTEGRATED; + + if (useHostBuffer) { + // TODO: assert that if hostPtr is set, either UR_MEM_FLAG_USE_HOST_POINTER + // or UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER is set? + auto hostPtrAction = + flags & UR_MEM_FLAG_USE_HOST_POINTER + ? 
ur_integrated_mem_handle_t::host_ptr_action_t::import + : ur_integrated_mem_handle_t::host_ptr_action_t::copy; + *phBuffer = + new ur_integrated_mem_handle_t(hContext, hostPtr, size, hostPtrAction); + } else { + *phBuffer = new ur_discrete_mem_handle_t(hContext, hostPtr, size); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t urMemBufferPartition(ur_mem_handle_t hBuffer, ur_mem_flags_t flags, + ur_buffer_create_type_t bufferCreateType, + const ur_buffer_region_t *pRegion, + ur_mem_handle_t *phMem) { + std::ignore = hBuffer; + std::ignore = flags; + std::ignore = bufferCreateType; + std::ignore = pRegion; + std::ignore = phMem; + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urMemBufferCreateWithNativeHandle( + ur_native_handle_t hNativeMem, ur_context_handle_t hContext, + const ur_mem_native_properties_t *pProperties, ur_mem_handle_t *phMem) { + std::ignore = hNativeMem; + std::ignore = hContext; + std::ignore = pProperties; + std::ignore = phMem; + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urMemGetInfo(ur_mem_handle_t hMemory, ur_mem_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { + std::shared_lock Lock(hMemory->Mutex); + UrReturnHelper returnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_MEM_INFO_CONTEXT: { + return returnValue(hMemory->getContext()); + } + case UR_MEM_INFO_SIZE: { + // Get size of the allocation + return returnValue(size_t{hMemory->getSize()}); + } + default: { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t urMemRetain(ur_mem_handle_t hMem) { + hMem->RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +ur_result_t urMemRelease(ur_mem_handle_t hMem) { + if (hMem->RefCount.decrementAndTest()) { + delete hMem; + } + return UR_RESULT_SUCCESS; +} +} // namespace 
ur::level_zero diff --git a/source/adapters/level_zero/v2/memory.hpp b/source/adapters/level_zero/v2/memory.hpp new file mode 100644 index 0000000000..863df8ea73 --- /dev/null +++ b/source/adapters/level_zero/v2/memory.hpp @@ -0,0 +1,127 @@ +//===--------- memory.hpp - Level Zero Adapter ---------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include "../device.hpp" +#include "common.hpp" + +struct ur_mem_handle_t_ : _ur_object { + ur_mem_handle_t_(ur_context_handle_t hContext, size_t size); + virtual ~ur_mem_handle_t_() = default; + + enum class access_mode_t { + read_write, + read_only, + write_only, + write_invalidate + }; + + virtual void * + getDevicePtr(ur_device_handle_t, access_mode_t, size_t offset, size_t size, + std::function mecmpy) = 0; + virtual void * + mapHostPtr(access_mode_t, size_t offset, size_t size, + std::function memcpy) = 0; + virtual void + unmapHostPtr(void *pMappedPtr, + std::function memcpy) = 0; + + inline size_t getSize() { return size; } + inline ur_context_handle_t getContext() { return hContext; } + +protected: + const ur_context_handle_t hContext; + const size_t size; +}; + +struct ur_usm_handle_t_ : ur_mem_handle_t_ { + ur_usm_handle_t_(ur_context_handle_t hContext, size_t size, const void *ptr); + ~ur_usm_handle_t_(); + + void * + getDevicePtr(ur_device_handle_t, access_mode_t, size_t offset, size_t size, + std::function) override; + void *mapHostPtr(access_mode_t, size_t offset, size_t size, + std::function) override; + void unmapHostPtr(void *pMappedPtr, + std::function) override; + +private: + void *ptr; +}; + +// Manages memory buffer for integrated GPU. 
+// For integrated devices the buffer has been allocated in host memory +// and can be accessed by the device without copying. +struct ur_integrated_mem_handle_t : public ur_mem_handle_t_ { + enum class host_ptr_action_t { import, copy }; + + ur_integrated_mem_handle_t(ur_context_handle_t hContext, void *hostPtr, + size_t size, host_ptr_action_t useHostPtr); + ~ur_integrated_mem_handle_t(); + + void * + getDevicePtr(ur_device_handle_t, access_mode_t, size_t offset, size_t size, + std::function) override; + void *mapHostPtr(access_mode_t, size_t offset, size_t size, + std::function) override; + void unmapHostPtr(void *pMappedPtr, + std::function) override; + +private: + void *ptr; +}; + +struct host_allocation_desc_t { + host_allocation_desc_t(void *ptr, size_t size, size_t offset, + ur_mem_handle_t_::access_mode_t access) + : ptr(ptr), size(size), offset(offset), access(access) {} + + void *ptr; + size_t size; + size_t offset; + ur_mem_handle_t_::access_mode_t access; +}; + +// Manages memory buffer for discrete GPU. +// Memory is allocated on the device and migrated/copies if necessary. +struct ur_discrete_mem_handle_t : public ur_mem_handle_t_ { + ur_discrete_mem_handle_t(ur_context_handle_t hContext, void *hostPtr, + size_t size); + ~ur_discrete_mem_handle_t(); + + void * + getDevicePtr(ur_device_handle_t, access_mode_t, size_t offset, size_t size, + std::function) override; + void *mapHostPtr(access_mode_t, size_t offset, size_t size, + std::function) override; + void unmapHostPtr(void *pMappedPtr, + std::function) override; + +private: + void *getDevicePtrUnlocked(ur_device_handle_t, access_mode_t, size_t offset, + size_t size, + std::function); + + // Vector of per-device allocations indexed by device->Id + std::vector deviceAllocations; + + // Specifies device on which the latest allocation resides. + // If null, there is no allocation. 
+ ur_device_handle_t activeAllocationDevice; + + std::vector hostAllocations; + + ur_result_t migrateBufferTo(ur_device_handle_t hDevice, void *src, + size_t size); +}; diff --git a/source/adapters/level_zero/queue_api.cpp b/source/adapters/level_zero/v2/queue_api.cpp similarity index 60% rename from source/adapters/level_zero/queue_api.cpp rename to source/adapters/level_zero/v2/queue_api.cpp index 4463500336..ea2e931bfe 100644 --- a/source/adapters/level_zero/queue_api.cpp +++ b/source/adapters/level_zero/v2/queue_api.cpp @@ -14,31 +14,30 @@ ur_queue_handle_t_::~ur_queue_handle_t_() {} -UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, - ur_queue_info_t propName, - size_t propSize, - void *pPropValue, - size_t *pPropSizeRet) { +namespace ur::level_zero { +ur_result_t urQueueGetInfo(ur_queue_handle_t hQueue, ur_queue_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { return hQueue->queueGetInfo(propName, propSize, pPropValue, pPropSizeRet); } -UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { +ur_result_t urQueueRetain(ur_queue_handle_t hQueue) { return hQueue->queueRetain(); } -UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { +ur_result_t urQueueRelease(ur_queue_handle_t hQueue) { return hQueue->queueRelease(); } -UR_APIEXPORT ur_result_t UR_APICALL -urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, - ur_native_handle_t *phNativeQueue) { +ur_result_t urQueueGetNativeHandle(ur_queue_handle_t hQueue, + ur_queue_native_desc_t *pDesc, + ur_native_handle_t *phNativeQueue) { return hQueue->queueGetNativeHandle(pDesc, phNativeQueue); } -UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { +ur_result_t urQueueFinish(ur_queue_handle_t hQueue) { return hQueue->queueFinish(); } -UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { +ur_result_t urQueueFlush(ur_queue_handle_t hQueue) { 
return hQueue->queueFlush(); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( +ur_result_t urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, @@ -47,27 +46,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( - ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { +ur_result_t urEnqueueEventsWait(ur_queue_handle_t hQueue, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { return hQueue->enqueueEventsWait(numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( +ur_result_t urEnqueueEventsWaitWithBarrier( ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { return hQueue->enqueueEventsWaitWithBarrier(numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( - ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, - size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { +ur_result_t urEnqueueMemBufferRead(ur_queue_handle_t hQueue, + ur_mem_handle_t hBuffer, bool blockingRead, + size_t offset, size_t size, void *pDst, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { return hQueue->enqueueMemBufferRead(hBuffer, blockingRead, offset, size, pDst, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t 
UR_APICALL urEnqueueMemBufferWrite( +ur_result_t urEnqueueMemBufferWrite( ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { @@ -75,7 +77,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( pSrc, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( +ur_result_t urEnqueueMemBufferReadRect( ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, @@ -87,7 +89,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( +ur_result_t urEnqueueMemBufferWriteRect( ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, @@ -99,16 +101,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( - ur_queue_handle_t hQueue, ur_mem_handle_t hBufferSrc, - ur_mem_handle_t hBufferDst, size_t srcOffset, size_t dstOffset, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { +ur_result_t urEnqueueMemBufferCopy(ur_queue_handle_t hQueue, + ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, size_t srcOffset, + size_t dstOffset, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t 
*phEvent) { return hQueue->enqueueMemBufferCopy(hBufferSrc, hBufferDst, srcOffset, dstOffset, size, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( +ur_result_t urEnqueueMemBufferCopyRect( ur_queue_handle_t hQueue, ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, @@ -120,16 +124,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( - ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, const void *pPattern, - size_t patternSize, size_t offset, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { +ur_result_t urEnqueueMemBufferFill(ur_queue_handle_t hQueue, + ur_mem_handle_t hBuffer, + const void *pPattern, size_t patternSize, + size_t offset, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { return hQueue->enqueueMemBufferFill(hBuffer, pPattern, patternSize, offset, size, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( +ur_result_t urEnqueueMemImageRead( ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingRead, ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pDst, uint32_t numEventsInWaitList, @@ -138,7 +144,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( hImage, blockingRead, origin, region, rowPitch, slicePitch, pDst, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( +ur_result_t urEnqueueMemImageWrite( ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingWrite, ur_rect_offset_t origin, 
ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pSrc, uint32_t numEventsInWaitList, @@ -147,78 +153,85 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( hImage, blockingWrite, origin, region, rowPitch, slicePitch, pSrc, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( - ur_queue_handle_t hQueue, ur_mem_handle_t hImageSrc, - ur_mem_handle_t hImageDst, ur_rect_offset_t srcOrigin, - ur_rect_offset_t dstOrigin, ur_rect_region_t region, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { +ur_result_t +urEnqueueMemImageCopy(ur_queue_handle_t hQueue, ur_mem_handle_t hImageSrc, + ur_mem_handle_t hImageDst, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { return hQueue->enqueueMemImageCopy(hImageSrc, hImageDst, srcOrigin, dstOrigin, region, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( - ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingMap, - ur_map_flags_t mapFlags, size_t offset, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent, void **ppRetMap) { +ur_result_t urEnqueueMemBufferMap(ur_queue_handle_t hQueue, + ur_mem_handle_t hBuffer, bool blockingMap, + ur_map_flags_t mapFlags, size_t offset, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, void **ppRetMap) { return hQueue->enqueueMemBufferMap(hBuffer, blockingMap, mapFlags, offset, size, numEventsInWaitList, phEventWaitList, phEvent, ppRetMap); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( - ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr, - uint32_t numEventsInWaitList, const ur_event_handle_t 
*phEventWaitList, - ur_event_handle_t *phEvent) { +ur_result_t urEnqueueMemUnmap(ur_queue_handle_t hQueue, ur_mem_handle_t hMem, + void *pMappedPtr, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { return hQueue->enqueueMemUnmap(hMem, pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( - ur_queue_handle_t hQueue, void *pMem, size_t patternSize, - const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { +ur_result_t urEnqueueUSMFill(ur_queue_handle_t hQueue, void *pMem, + size_t patternSize, const void *pPattern, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { return hQueue->enqueueUSMFill(pMem, patternSize, pPattern, size, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( - ur_queue_handle_t hQueue, bool blocking, void *pDst, const void *pSrc, - size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { +ur_result_t urEnqueueUSMMemcpy(ur_queue_handle_t hQueue, bool blocking, + void *pDst, const void *pSrc, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { return hQueue->enqueueUSMMemcpy(blocking, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( - ur_queue_handle_t hQueue, const void *pMem, size_t size, - ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { +ur_result_t urEnqueueUSMPrefetch(ur_queue_handle_t hQueue, const void *pMem, + size_t size, ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, 
+ ur_event_handle_t *phEvent) { return hQueue->enqueueUSMPrefetch(pMem, size, flags, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL -urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, - ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) { +ur_result_t urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, + size_t size, ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent) { return hQueue->enqueueUSMAdvise(pMem, size, advice, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D( - ur_queue_handle_t hQueue, void *pMem, size_t pitch, size_t patternSize, - const void *pPattern, size_t width, size_t height, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { +ur_result_t urEnqueueUSMFill2D(ur_queue_handle_t hQueue, void *pMem, + size_t pitch, size_t patternSize, + const void *pPattern, size_t width, + size_t height, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { return hQueue->enqueueUSMFill2D(pMem, pitch, patternSize, pPattern, width, height, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( - ur_queue_handle_t hQueue, bool blocking, void *pDst, size_t dstPitch, - const void *pSrc, size_t srcPitch, size_t width, size_t height, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { +ur_result_t urEnqueueUSMMemcpy2D(ur_queue_handle_t hQueue, bool blocking, + void *pDst, size_t dstPitch, const void *pSrc, + size_t srcPitch, size_t width, size_t height, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { return hQueue->enqueueUSMMemcpy2D(blocking, pDst, dstPitch, pSrc, srcPitch, width, height, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL 
urEnqueueDeviceGlobalVariableWrite( +ur_result_t urEnqueueDeviceGlobalVariableWrite( ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, bool blockingWrite, size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, @@ -227,7 +240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( hProgram, name, blockingWrite, count, offset, pSrc, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( +ur_result_t urEnqueueDeviceGlobalVariableRead( ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, bool blockingRead, size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, @@ -236,25 +249,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( hProgram, name, blockingRead, count, offset, pDst, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueReadHostPipe( - ur_queue_handle_t hQueue, ur_program_handle_t hProgram, - const char *pipe_symbol, bool blocking, void *pDst, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { +ur_result_t urEnqueueReadHostPipe(ur_queue_handle_t hQueue, + ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pDst, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { return hQueue->enqueueReadHostPipe(hProgram, pipe_symbol, blocking, pDst, size, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( - ur_queue_handle_t hQueue, ur_program_handle_t hProgram, - const char *pipe_symbol, bool blocking, void *pSrc, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { 
+ur_result_t urEnqueueWriteHostPipe(ur_queue_handle_t hQueue, + ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pSrc, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { return hQueue->enqueueWriteHostPipe(hProgram, pipe_symbol, blocking, pSrc, size, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( +ur_result_t urBindlessImagesImageCopyExp( ur_queue_handle_t hQueue, const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, const ur_image_desc_t *pDstImageDesc, const ur_image_format_t *pSrcImageFormat, @@ -267,23 +284,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( pDstImageFormat, pCopyRegion, imageCopyFlags, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( - ur_queue_handle_t hQueue, ur_exp_interop_semaphore_handle_t hSemaphore, +ur_result_t urBindlessImagesWaitExternalSemaphoreExp( + ur_queue_handle_t hQueue, ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, uint64_t waitValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { return hQueue->bindlessImagesWaitExternalSemaphoreExp( hSemaphore, hasWaitValue, waitValue, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( - ur_queue_handle_t hQueue, ur_exp_interop_semaphore_handle_t hSemaphore, +ur_result_t urBindlessImagesSignalExternalSemaphoreExp( + ur_queue_handle_t hQueue, ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, uint64_t signalValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { return hQueue->bindlessImagesSignalExternalSemaphoreExp( hSemaphore, hasSignalValue, signalValue, numEventsInWaitList, 
phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( +ur_result_t urEnqueueCooperativeKernelLaunchExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, @@ -292,13 +309,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( +ur_result_t urEnqueueTimestampRecordingExp( ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { return hQueue->enqueueTimestampRecordingExp(blocking, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( +ur_result_t urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, @@ -310,7 +327,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( numPropsInLaunchPropList, launchPropList, numEventsInWaitList, phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( +ur_result_t urEnqueueNativeCommandExp( ur_queue_handle_t hQueue, ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, uint32_t numMemsInMemList, const ur_mem_handle_t *phMemList, @@ -321,3 +338,4 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( pfnNativeEnqueue, data, numMemsInMemList, phMemList, pProperties, numEventsInWaitList, phEventWaitList, phEvent); } +} // namespace ur::level_zero \ No newline at end of file diff --git a/source/adapters/level_zero/queue_api.hpp 
b/source/adapters/level_zero/v2/queue_api.hpp similarity index 98% rename from source/adapters/level_zero/queue_api.hpp rename to source/adapters/level_zero/v2/queue_api.hpp index d5c61c3ab0..bc01596d2b 100644 --- a/source/adapters/level_zero/queue_api.hpp +++ b/source/adapters/level_zero/v2/queue_api.hpp @@ -128,10 +128,10 @@ struct ur_queue_handle_t_ { ur_exp_image_copy_region_t *, ur_exp_image_copy_flags_t, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t bindlessImagesWaitExternalSemaphoreExp( - ur_exp_interop_semaphore_handle_t, bool, uint64_t, uint32_t, + ur_exp_external_semaphore_handle_t, bool, uint64_t, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t bindlessImagesSignalExternalSemaphoreExp( - ur_exp_interop_semaphore_handle_t, bool, uint64_t, uint32_t, + ur_exp_external_semaphore_handle_t, bool, uint64_t, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t enqueueCooperativeKernelLaunchExp( ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, diff --git a/source/adapters/level_zero/v2/queue_create.cpp b/source/adapters/level_zero/v2/queue_create.cpp new file mode 100644 index 0000000000..ec9182f5ef --- /dev/null +++ b/source/adapters/level_zero/v2/queue_create.cpp @@ -0,0 +1,47 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM + * Exceptions. 
See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file queue_api.cpp + * + */ + +#include "logger/ur_logger.hpp" +#include "queue_api.hpp" +#include "queue_immediate_in_order.hpp" + +#include +#include + +namespace ur::level_zero { +ur_result_t urQueueCreate(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + const ur_queue_properties_t *pProperties, + ur_queue_handle_t *phQueue) { + if (!hContext->isValidDevice(hDevice)) { + return UR_RESULT_ERROR_INVALID_DEVICE; + } + + // TODO: For now, always use immediate, in-order + *phQueue = + new v2::ur_queue_immediate_in_order_t(hContext, hDevice, pProperties); + return UR_RESULT_SUCCESS; +} + +ur_result_t urQueueCreateWithNativeHandle( + ur_native_handle_t hNativeQueue, ur_context_handle_t hContext, + ur_device_handle_t hDevice, const ur_queue_native_properties_t *pProperties, + ur_queue_handle_t *phQueue) { + std::ignore = hNativeQueue; + std::ignore = hContext; + std::ignore = hDevice; + std::ignore = pProperties; + std::ignore = phQueue; + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/v2/queue_factory.hpp b/source/adapters/level_zero/v2/queue_factory.hpp deleted file mode 100644 index 774da9aa89..0000000000 --- a/source/adapters/level_zero/v2/queue_factory.hpp +++ /dev/null @@ -1,40 +0,0 @@ -//===--------- queue_factory.cpp - Level Zero Adapter --------------------===// -// -// Copyright (C) 2024 Intel Corporation -// -// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM -// Exceptions. 
See LICENSE.TXT -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "../queue.hpp" -#include "context.hpp" - -#include "queue_immediate_in_order.hpp" - -namespace v2 { - -inline bool shouldUseQueueV2(ur_device_handle_t hDevice, - ur_queue_flags_t flags) { - std::ignore = hDevice; - std::ignore = flags; - - const char *UrRet = std::getenv("UR_L0_USE_QUEUE_V2"); - return UrRet && std::stoi(UrRet); -} - -inline ur_queue_handle_t createQueue(::ur_context_handle_t hContext, - ur_device_handle_t hDevice, - const ur_queue_properties_t *pProps) { - if (!shouldUseQueueV2(hDevice, pProps ? pProps->flags : ur_queue_flags_t{})) { - throw UR_RESULT_ERROR_INVALID_ARGUMENT; - } - // TODO: For now, always use immediate, in-order - return new ur_queue_immediate_in_order_t( - static_cast(hContext), hDevice, pProps); -} - -} // namespace v2 diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 8442378972..b68af85033 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -9,9 +9,40 @@ //===----------------------------------------------------------------------===// #include "queue_immediate_in_order.hpp" +#include "kernel.hpp" +#include "memory.hpp" +#include "ur.hpp" + +#include "../common/latency_tracker.hpp" +#include "../helpers/kernel_helpers.hpp" +#include "../helpers/memory_helpers.hpp" +#include "../program.hpp" +#include "../ur_interface_loader.hpp" namespace v2 { +std::pair +ur_queue_immediate_in_order_t::getWaitListView( + const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, + ur_command_list_handler_t *pHandler) { + auto extraWaitEvent = (lastHandler && pHandler != lastHandler) + ? 
lastHandler->lastEvent->getZeEvent() + : nullptr; + + auto totalEvents = numWaitEvents + (extraWaitEvent != nullptr); + waitList.reserve(totalEvents); + + for (uint32_t i = 0; i < numWaitEvents; i++) { + waitList[i] = phWaitEvents[i]->getZeEvent(); + } + + if (extraWaitEvent) { + waitList[numWaitEvents] = extraWaitEvent; + } + + return {waitList.data(), static_cast(totalEvents)}; +} + static int32_t getZeOrdinal(ur_device_handle_t hDevice, queue_group_type type) { if (type == queue_group_type::MainCopy && hDevice->hasMainCopyEngine()) { return hDevice->QueueGroup[queue_group_type::MainCopy].ZeOrdinal; @@ -41,29 +72,102 @@ static ze_command_queue_priority_t getZePriority(ur_queue_flags_t flags) { } ur_command_list_handler_t::ur_command_list_handler_t( - v2::ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_queue_properties_t *pProps, queue_group_type type) + ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_queue_properties_t *pProps, queue_group_type type, + event_pool *eventPool) : commandList(hContext->commandListCache.getImmediateCommandList( hDevice->ZeDevice, true, getZeOrdinal(hDevice, type), ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, getZePriority(pProps ? 
pProps->flags : ur_queue_flags_t{}), - getZeIndex(pProps))) {} + getZeIndex(pProps))), + internalEvent(eventPool->allocate(), [=](ur_event_handle_t event) { + ur::level_zero::urEventRelease(event); + }) {} + +static event_flags_t eventFlagsFromQueueFlags(ur_queue_flags_t flags) { + event_flags_t eventFlags = EVENT_FLAGS_COUNTER; + if (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) + eventFlags |= EVENT_FLAGS_PROFILING_ENABLED; + return eventFlags; +} ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( - v2::ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_properties_t *pProps) - : copyHandler(hContext, hDevice, pProps, queue_group_type::MainCopy), - computeHandler(hContext, hDevice, pProps, queue_group_type::Compute) {} + : hContext(hContext), hDevice(hDevice), flags(pProps ? pProps->flags : 0), + eventPool(hContext->eventPoolCache.borrow( + hDevice->Id.value(), eventFlagsFromQueueFlags(flags))), + copyHandler(hContext, hDevice, pProps, queue_group_type::MainCopy, + eventPool.get()), + computeHandler(hContext, hDevice, pProps, queue_group_type::Compute, + eventPool.get()) {} + +ur_command_list_handler_t * +ur_queue_immediate_in_order_t::getCommandListHandlerForCompute() { + return &computeHandler; +} + +ur_command_list_handler_t * +ur_queue_immediate_in_order_t::getCommandListHandlerForCopy() { + // TODO: optimize for specific devices, see ../memory.cpp + return ©Handler; +} + +ur_command_list_handler_t * +ur_queue_immediate_in_order_t::getCommandListHandlerForFill( + size_t patternSize) { + if (patternSize <= hDevice->QueueGroup[queue_group_type::MainCopy] + .ZeProperties.maxMemoryFillPatternSize) + return ©Handler; + else + return &computeHandler; +} + +ur_event_handle_t ur_queue_immediate_in_order_t::getSignalEvent( + ur_command_list_handler_t *handler, ur_event_handle_t *hUserEvent) { + if (!hUserEvent) { + handler->lastEvent = handler->internalEvent.get(); + } else { + 
*hUserEvent = eventPool->allocate(); + handler->lastEvent = *hUserEvent; + } + + return handler->lastEvent; +} ur_result_t ur_queue_immediate_in_order_t::queueGetInfo(ur_queue_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { - std::ignore = propName; - std::ignore = propSize; - std::ignore = pPropValue; - std::ignore = pPropSizeRet; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + std::shared_lock Lock(Mutex); + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + // TODO: consider support for queue properties and size + switch ((uint32_t)propName) { // cast to avoid warnings on EXT enum values + case UR_QUEUE_INFO_CONTEXT: + return ReturnValue(hContext); + case UR_QUEUE_INFO_DEVICE: + return ReturnValue(hDevice); + case UR_QUEUE_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{RefCount.load()}); + case UR_QUEUE_INFO_FLAGS: + return ReturnValue(flags); + case UR_QUEUE_INFO_SIZE: + case UR_QUEUE_INFO_DEVICE_DEFAULT: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + case UR_QUEUE_INFO_EMPTY: { + // We can exit early if we have in-order queue. 
+ if (!lastHandler) + return ReturnValue(true); + [[fallthrough]]; + } + default: + logger::error( + "Unsupported ParamName in urQueueGetInfo: ParamName=ParamName={}(0x{})", + propName, logger::toHex(propName)); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; } ur_result_t ur_queue_immediate_in_order_t::queueRetain() { @@ -86,12 +190,47 @@ ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +ur_result_t ur_queue_immediate_in_order_t::finalizeHandler( + ur_command_list_handler_t *handler) { + lastHandler = handler; + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_immediate_in_order_t::finalizeHandler( + ur_command_list_handler_t *handler, bool blocking) { + if (blocking) { + ZE2UR_CALL(zeCommandListHostSynchronize, + (handler->commandList.get(), UINT64_MAX)); + lastHandler = nullptr; + } else { + finalizeHandler(handler); + } + + return UR_RESULT_SUCCESS; +} + ur_result_t ur_queue_immediate_in_order_t::queueFinish() { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish"); + std::unique_lock lock(this->Mutex); + + if (!lastHandler) { + return UR_RESULT_SUCCESS; + } + + auto lastCmdList = lastHandler->commandList.get(); + lastHandler = nullptr; + lock.unlock(); + + // TODO: use zeEventHostSynchronize instead? 
+ TRACK_SCOPE_LATENCY( + "ur_queue_immediate_in_order_t::zeCommandListHostSynchronize"); + ZE2UR_CALL(zeCommandListHostSynchronize, (lastCmdList, UINT64_MAX)); + + return UR_RESULT_SUCCESS; } ur_result_t ur_queue_immediate_in_order_t::queueFlush() { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + return UR_RESULT_SUCCESS; } ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch( @@ -99,63 +238,200 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = hKernel; - std::ignore = workDim; - std::ignore = pGlobalWorkOffset; - std::ignore = pGlobalWorkSize; - std::ignore = pLocalWorkSize; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueKernelLaunch"); + + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + + ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(hDevice); + + std::scoped_lock Lock( + hKernel->Mutex, hKernel->getProgramHandle()->Mutex, this->Mutex); + + if (pGlobalWorkOffset != NULL) { + UR_CALL(setKernelGlobalOffset(hContext, hZeKernel, pGlobalWorkOffset)); + } + + ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; + uint32_t WG[3]; + UR_CALL(calculateKernelWorkDimensions(hZeKernel, hDevice, + zeThreadGroupDimensions, WG, workDim, + pGlobalWorkSize, pLocalWorkSize)); + + ZE2UR_CALL(zeKernelSetGroupSize, (hZeKernel, WG[0], WG[1], WG[2])); + + auto handler = getCommandListHandlerForCompute(); + auto signalEvent = getSignalEvent(handler, phEvent); + + auto 
[pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList, handler); + + // TODO: consider migrating memory to the device if memory buffers are used + + TRACK_SCOPE_LATENCY( + "ur_queue_immediate_in_order_t::zeCommandListAppendLaunchKernel"); + ZE2UR_CALL(zeCommandListAppendLaunchKernel, + (handler->commandList.get(), hZeKernel, &zeThreadGroupDimensions, + signalEvent->getZeEvent(), numWaitEvents, pWaitEvents)); + + return finalizeHandler(handler); } ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueEventsWait"); + + std::unique_lock lock(this->Mutex); + + auto handler = getCommandListHandlerForCompute(); + auto signalEvent = getSignalEvent(handler, phEvent); + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList, handler); + + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (handler->commandList.get(), numWaitEvents, pWaitEvents)); + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (handler->commandList.get(), signalEvent->getZeEvent())); + + return finalizeHandler(handler); } ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + // For in-order queue we don't need a real barrier, just wait for + // requested events in potentially different queues and add a "barrier" + // event signal because it is already guaranteed that previous commands + // in this queue are completed when the signal is started. 
+ return enqueueEventsWait(numEventsInWaitList, phEventWaitList, phEvent); +} + +ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked( + ur_mem_handle_t src, ur_mem_handle_t dst, bool blocking, size_t srcOffset, + size_t dstOffset, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + auto handler = getCommandListHandlerForCopy(); + auto signalEvent = getSignalEvent(handler, phEvent); + + auto waitList = + getWaitListView(phEventWaitList, numEventsInWaitList, handler); + + bool memoryMigrated = false; + auto pSrc = ur_cast(src->getDevicePtr( + hDevice, ur_mem_handle_t_::access_mode_t::read_only, srcOffset, size, + [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (handler->commandList.get(), dst, src, size, nullptr, + waitList.second, waitList.first)); + memoryMigrated = true; + })); + + auto pDst = ur_cast(dst->getDevicePtr( + hDevice, ur_mem_handle_t_::access_mode_t::write_only, dstOffset, size, + [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (handler->commandList.get(), dst, src, size, nullptr, + waitList.second, waitList.first)); + memoryMigrated = true; + })); + + if (memoryMigrated) { + // If memory was migrated, we don't need to pass the wait list to + // the copy command again. 
+ waitList.first = nullptr; + waitList.second = 0; + } + + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (handler->commandList.get(), pDst, pSrc, size, + signalEvent->getZeEvent(), waitList.second, waitList.first)); + + return finalizeHandler(handler, blocking); } ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferRead( ur_mem_handle_t hBuffer, bool blockingRead, size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = hBuffer; - std::ignore = blockingRead; - std::ignore = offset; - std::ignore = size; - std::ignore = pDst; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferRead"); + + UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); + + std::scoped_lock Lock(this->Mutex); + + ur_usm_handle_t_ dstHandle(hContext, size, pDst); + return enqueueGenericCopyUnlocked(hBuffer, &dstHandle, blockingRead, offset, + 0, size, numEventsInWaitList, + phEventWaitList, phEvent); } ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferWrite( ur_mem_handle_t hBuffer, bool blockingWrite, size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = hBuffer; - std::ignore = blockingWrite; - std::ignore = offset; - std::ignore = size; - std::ignore = pSrc; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferWrite"); + + UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); + + std::scoped_lock Lock(this->Mutex); + + ur_usm_handle_t_ srcHandle(hContext, size, pSrc); + return 
enqueueGenericCopyUnlocked(&srcHandle, hBuffer, blockingWrite, 0, + offset, size, numEventsInWaitList, + phEventWaitList, phEvent); +} + +ur_result_t ur_queue_immediate_in_order_t::enqueueRegionCopyUnlocked( + ur_mem_handle_t src, ur_mem_handle_t dst, bool blocking, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, + size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + auto zeParams = ur2zeRegionParams(srcOrigin, dstOrigin, region, srcRowPitch, + dstRowPitch, srcSlicePitch, dstSlicePitch); + + auto handler = getCommandListHandlerForCopy(); + auto signalEvent = getSignalEvent(handler, phEvent); + + auto waitList = + getWaitListView(phEventWaitList, numEventsInWaitList, handler); + + bool memoryMigrated = false; + auto pSrc = ur_cast(src->getDevicePtr( + hDevice, ur_mem_handle_t_::access_mode_t::read_only, 0, src->getSize(), + [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (handler->commandList.get(), dst, src, size, nullptr, + waitList.second, waitList.first)); + memoryMigrated = true; + })); + auto pDst = ur_cast(dst->getDevicePtr( + hDevice, ur_mem_handle_t_::access_mode_t::write_only, 0, dst->getSize(), + [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (handler->commandList.get(), dst, src, size, nullptr, + waitList.second, waitList.first)); + memoryMigrated = true; + })); + + if (memoryMigrated) { + // If memory was migrated, we don't need to pass the wait list to + // the copy command again. 
+ waitList.first = nullptr; + waitList.second = 0; + } + + ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, + (handler->commandList.get(), pDst, &zeParams.dstRegion, + zeParams.dstPitch, zeParams.dstSlicePitch, pSrc, + &zeParams.srcRegion, zeParams.srcPitch, zeParams.srcSlicePitch, + signalEvent->getZeEvent(), waitList.second, waitList.first)); + + return finalizeHandler(handler, blocking); } ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferReadRect( @@ -164,20 +440,16 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferReadRect( size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = hBuffer; - std::ignore = blockingRead; - std::ignore = bufferOrigin; - std::ignore = hostOrigin; - std::ignore = region; - std::ignore = bufferRowPitch; - std::ignore = bufferSlicePitch; - std::ignore = hostRowPitch; - std::ignore = hostSlicePitch; - std::ignore = pDst; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + TRACK_SCOPE_LATENCY( + "ur_queue_immediate_in_order_t::enqueueMemBufferReadRect"); + + std::scoped_lock Lock(this->Mutex); + + ur_usm_handle_t_ dstHandle(hContext, 0, pDst); + return enqueueRegionCopyUnlocked( + hBuffer, &dstHandle, blockingRead, bufferOrigin, hostOrigin, region, + bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, + numEventsInWaitList, phEventWaitList, phEvent); } ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferWriteRect( @@ -186,35 +458,34 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferWriteRect( size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = hBuffer; - std::ignore = blockingWrite; - std::ignore = bufferOrigin; - 
std::ignore = hostOrigin; - std::ignore = region; - std::ignore = bufferRowPitch; - std::ignore = bufferSlicePitch; - std::ignore = hostRowPitch; - std::ignore = hostSlicePitch; - std::ignore = pSrc; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + TRACK_SCOPE_LATENCY( + "ur_queue_immediate_in_order_t::enqueueMemBufferWriteRect"); + + std::scoped_lock Lock(this->Mutex); + + ur_usm_handle_t_ srcHandle(hContext, 0, pSrc); + return enqueueRegionCopyUnlocked( + &srcHandle, hBuffer, blockingWrite, hostOrigin, bufferOrigin, region, + hostRowPitch, hostSlicePitch, bufferRowPitch, bufferSlicePitch, + numEventsInWaitList, phEventWaitList, phEvent); } ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferCopy( ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = hBufferSrc; - std::ignore = hBufferDst; - std::ignore = srcOffset; - std::ignore = dstOffset; - std::ignore = size; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferCopy"); + + UR_ASSERT(srcOffset + size <= hBufferSrc->getSize(), + UR_RESULT_ERROR_INVALID_SIZE); + UR_ASSERT(dstOffset + size <= hBufferDst->getSize(), + UR_RESULT_ERROR_INVALID_SIZE); + + std::scoped_lock Lock(this->Mutex); + + return enqueueGenericCopyUnlocked(hBufferSrc, hBufferDst, false, srcOffset, + dstOffset, size, numEventsInWaitList, + phEventWaitList, phEvent); } ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferCopyRect( @@ -223,34 +494,30 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferCopyRect( ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t 
dstSlicePitch, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = hBufferSrc; - std::ignore = hBufferDst; - std::ignore = srcOrigin; - std::ignore = dstOrigin; - std::ignore = region; - std::ignore = srcRowPitch; - std::ignore = srcSlicePitch; - std::ignore = dstRowPitch; - std::ignore = dstSlicePitch; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + TRACK_SCOPE_LATENCY( + "ur_queue_immediate_in_order_t::enqueueMemBufferCopyRect"); + + std::scoped_lock Lock(this->Mutex); + + return enqueueRegionCopyUnlocked( + hBufferSrc, hBufferDst, false, srcOrigin, dstOrigin, region, srcRowPitch, + srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, + phEventWaitList, phEvent); } ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferFill( ur_mem_handle_t hBuffer, const void *pPattern, size_t patternSize, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = hBuffer; - std::ignore = pPattern; - std::ignore = patternSize; - std::ignore = offset; - std::ignore = size; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferFill"); + + UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); + + std::scoped_lock Lock(this->Mutex); + + return enqueueGenericFillUnlocked(hBuffer, offset, patternSize, pPattern, + size, numEventsInWaitList, phEventWaitList, + phEvent); } ur_result_t ur_queue_immediate_in_order_t::enqueueMemImageRead( @@ -305,84 +572,225 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemImageCopy( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +static ur_mem_handle_t_::access_mode_t getAccessMode(ur_map_flags_t mapFlags) { + if 
(mapFlags & UR_MAP_FLAG_WRITE_INVALIDATE_REGION) { + return ur_mem_handle_t_::access_mode_t::write_invalidate; + } else if ((mapFlags & UR_MAP_FLAG_READ) && (mapFlags & UR_MAP_FLAG_WRITE)) { + return ur_mem_handle_t_::access_mode_t::read_write; + } else if (mapFlags & UR_MAP_FLAG_READ) { + return ur_mem_handle_t_::access_mode_t::read_only; + } else if (mapFlags & UR_MAP_FLAG_WRITE) { + return ur_mem_handle_t_::access_mode_t::write_only; + } else { + throw UR_RESULT_ERROR_INVALID_VALUE; + } +} + ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( ur_mem_handle_t hBuffer, bool blockingMap, ur_map_flags_t mapFlags, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, void **ppRetMap) { - std::ignore = hBuffer; - std::ignore = blockingMap; - std::ignore = mapFlags; - std::ignore = offset; - std::ignore = size; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - std::ignore = ppRetMap; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferMap"); + + ur_mem_handle_t_::access_mode_t accessMode = getAccessMode(mapFlags); + + std::scoped_lock Lock(this->Mutex); + + auto handler = getCommandListHandlerForCopy(); + auto signalEvent = getSignalEvent(handler, phEvent); + + auto waitList = + getWaitListView(phEventWaitList, numEventsInWaitList, handler); + + bool memoryMigrated = false; + auto pDst = ur_cast(hBuffer->mapHostPtr( + accessMode, offset, size, [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (handler->commandList.get(), dst, src, size, nullptr, + waitList.second, waitList.first)); + memoryMigrated = true; + })); + *ppRetMap = pDst; + + if (!memoryMigrated && waitList.second) { + // If memory was not migrated, we need to wait on the events here. 
+ ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (handler->commandList.get(), waitList.second, waitList.first)); + if (signalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (handler->commandList.get(), signalEvent->getZeEvent())); + } + } + + return finalizeHandler(handler, blockingMap); } ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = hMem; - std::ignore = pMappedPtr; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemUnmap"); + + std::scoped_lock Lock(this->Mutex); + + auto handler = getCommandListHandlerForCopy(); + auto signalEvent = getSignalEvent(handler, phEvent); + + auto waitList = + getWaitListView(phEventWaitList, numEventsInWaitList, handler); + + // TODO: currently unmapHostPtr deallocates memory immediately, + // since the memory might be used by the user, we need to make sure + // all dependencies are completed. 
+ ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (handler->commandList.get(), waitList.second, waitList.first)); + + bool memoryMigrated = false; + hMem->unmapHostPtr(pMappedPtr, [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (handler->commandList.get(), dst, src, size, nullptr, + waitList.second, waitList.first)); + memoryMigrated = true; + }); + + if (signalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (handler->commandList.get(), signalEvent->getZeEvent())); + } + + return finalizeHandler(handler); +} + +ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked( + ur_mem_handle_t dst, size_t offset, size_t patternSize, + const void *pPattern, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + auto handler = getCommandListHandlerForFill(patternSize); + auto signalEvent = getSignalEvent(handler, phEvent); + + auto waitList = + getWaitListView(phEventWaitList, numEventsInWaitList, handler); + + bool memoryMigrated = false; + auto pDst = ur_cast(dst->getDevicePtr( + hDevice, ur_mem_handle_t_::access_mode_t::read_only, offset, size, + [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (handler->commandList.get(), dst, src, size, nullptr, + waitList.second, waitList.first)); + memoryMigrated = true; + })); + + if (memoryMigrated) { + // If memory was migrated, we don't need to pass the wait list to + // the copy command again. + waitList.first = nullptr; + waitList.second = 0; + } + + // TODO: support non-power-of-two pattern sizes + + // PatternSize must be a power of two for zeCommandListAppendMemoryFill. + // When it's not, the fill is emulated with zeCommandListAppendMemoryCopy. 
+ ZE2UR_CALL(zeCommandListAppendMemoryFill, + (handler->commandList.get(), pDst, pPattern, patternSize, size, + signalEvent->getZeEvent(), waitList.second, waitList.first)); + + return finalizeHandler(handler); } ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFill( void *pMem, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = pMem; - std::ignore = patternSize; - std::ignore = pPattern; - std::ignore = size; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMFill"); + + std::scoped_lock Lock(this->Mutex); + + ur_usm_handle_t_ dstHandle(hContext, size, pMem); + return enqueueGenericFillUnlocked(&dstHandle, 0, patternSize, pPattern, size, + numEventsInWaitList, phEventWaitList, + phEvent); } ur_result_t ur_queue_immediate_in_order_t::enqueueUSMMemcpy( bool blocking, void *pDst, const void *pSrc, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = blocking; - std::ignore = pDst; - std::ignore = pSrc; - std::ignore = size; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + // TODO: parametrize latency tracking with 'blocking' + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMMemcpy"); + + std::scoped_lock Lock(this->Mutex); + + auto handler = getCommandListHandlerForCopy(); + auto signalEvent = getSignalEvent(handler, phEvent); + + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList, handler); + + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (handler->commandList.get(), pDst, pSrc, size, + signalEvent->getZeEvent(), numWaitEvents, pWaitEvents)); + + return 
finalizeHandler(handler, blocking); } ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( const void *pMem, size_t size, ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = pMem; - std::ignore = size; std::ignore = flags; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + std::scoped_lock Lock(this->Mutex); + + auto handler = getCommandListHandlerForCompute(); + auto signalEvent = getSignalEvent(handler, phEvent); + + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList, handler); + + if (pWaitEvents) { + ZE2UR_CALL(zeCommandListAppendBarrier, (handler->commandList.get(), nullptr, + numWaitEvents, pWaitEvents)); + } + // TODO: figure out how to translate "flags" + ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, + (handler->commandList.get(), pMem, size)); + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (handler->commandList.get(), signalEvent->getZeEvent())); + + return finalizeHandler(handler); } ur_result_t ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) { - std::ignore = pMem; - std::ignore = size; - std::ignore = advice; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + std::ignore = flags; + + auto zeAdvice = ur_cast(advice); + + std::scoped_lock Lock(this->Mutex); + + auto handler = getCommandListHandlerForCompute(); + auto signalEvent = getSignalEvent(handler, phEvent); + + auto [pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0, handler); + + if (pWaitEvents) { + ZE2UR_CALL(zeCommandListAppendBarrier, (handler->commandList.get(), nullptr, + numWaitEvents, pWaitEvents)); + } + + // TODO: figure out how to translate "flags" + ZE2UR_CALL(zeCommandListAppendMemAdvise, + (handler->commandList.get(), 
this->hDevice->ZeDevice, pMem, size, + zeAdvice)); + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (handler->commandList.get(), signalEvent->getZeEvent())); + + return finalizeHandler(handler); } ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFill2D( @@ -418,36 +826,65 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMMemcpy2D( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +static void *getGlobalPointerFromModule(ze_module_handle_t hModule, + size_t offset, size_t count, + const char *name) { + // Find global variable pointer + size_t globalVarSize = 0; + void *globalVarPtr = nullptr; + ZE2UR_CALL_THROWS(zeModuleGetGlobalPointer, + (hModule, name, &globalVarSize, &globalVarPtr)); + if (globalVarSize < offset + count) { + setErrorMessage("Write device global variable is out of range.", + UR_RESULT_ERROR_INVALID_VALUE, + static_cast(ZE_RESULT_ERROR_INVALID_ARGUMENT)); + throw UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + return globalVarPtr; +} + ur_result_t ur_queue_immediate_in_order_t::enqueueDeviceGlobalVariableWrite( ur_program_handle_t hProgram, const char *name, bool blockingWrite, size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = hProgram; - std::ignore = name; - std::ignore = blockingWrite; - std::ignore = count; - std::ignore = offset; - std::ignore = pSrc; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + // TODO: implement program->getZeModuleMap() to be sure that + // it's thread-safe + ze_module_handle_t zeModule{}; + auto It = hProgram->ZeModuleMap.find(this->hDevice->ZeDevice); + if (It != hProgram->ZeModuleMap.end()) { + zeModule = It->second; + } else { + zeModule = hProgram->ZeModule; + } + + // Find global variable pointer + auto globalVarPtr = getGlobalPointerFromModule(zeModule, offset, count, name); + + return 
enqueueUSMMemcpy(blockingWrite, ur_cast(globalVarPtr) + offset, + pSrc, count, numEventsInWaitList, phEventWaitList, + phEvent); } ur_result_t ur_queue_immediate_in_order_t::enqueueDeviceGlobalVariableRead( ur_program_handle_t hProgram, const char *name, bool blockingRead, size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = hProgram; - std::ignore = name; - std::ignore = blockingRead; - std::ignore = count; - std::ignore = offset; - std::ignore = pDst; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + // TODO: implement program->getZeModule() to be sure that + // it's thread-safe + ze_module_handle_t zeModule{}; + auto It = hProgram->ZeModuleMap.find(this->hDevice->ZeDevice); + if (It != hProgram->ZeModuleMap.end()) { + zeModule = It->second; + } else { + zeModule = hProgram->ZeModule; + } + + // Find global variable pointer + auto globalVarPtr = getGlobalPointerFromModule(zeModule, offset, count, name); + + return enqueueUSMMemcpy(blockingRead, pDst, + ur_cast(globalVarPtr) + offset, count, + numEventsInWaitList, phEventWaitList, phEvent); } ur_result_t ur_queue_immediate_in_order_t::enqueueReadHostPipe( @@ -504,7 +941,7 @@ ur_result_t ur_queue_immediate_in_order_t::bindlessImagesImageCopyExp( ur_result_t ur_queue_immediate_in_order_t::bindlessImagesWaitExternalSemaphoreExp( - ur_exp_interop_semaphore_handle_t hSemaphore, bool hasWaitValue, + ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, uint64_t waitValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { std::ignore = hSemaphore; @@ -518,7 +955,7 @@ ur_queue_immediate_in_order_t::bindlessImagesWaitExternalSemaphoreExp( ur_result_t ur_queue_immediate_in_order_t::bindlessImagesSignalExternalSemaphoreExp( - ur_exp_interop_semaphore_handle_t 
hSemaphore, bool hasSignalValue, + ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, uint64_t signalValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { std::ignore = hSemaphore; @@ -549,11 +986,26 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( bool blocking, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = blocking; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + std::scoped_lock lock(this->Mutex); + + auto handler = getCommandListHandlerForCompute(); + auto signalEvent = getSignalEvent(handler, phEvent); + + if (!signalEvent) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList, handler); + + signalEvent->recordStartTimestamp(); + + ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp, + (handler->commandList.get(), + signalEvent->getEventEndTimestampPtr(), signalEvent->getZeEvent(), + numWaitEvents, pWaitEvents)); + + return finalizeHandler(handler, blocking); } ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp( diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index e55f274148..e56950c4e2 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -10,8 +10,12 @@ #pragma once #include "../common.hpp" -#include "../queue.hpp" +#include "../device.hpp" + #include "context.hpp" +#include "event.hpp" +#include "event_pool_cache.hpp" +#include "queue_api.hpp" #include "ur/ur.hpp" @@ -20,22 +24,71 @@ namespace v2 { using queue_group_type = 
ur_device_handle_t_::queue_group_info_t::type; struct ur_command_list_handler_t { - ur_command_list_handler_t(v2::ur_context_handle_t hContext, + ur_command_list_handler_t(ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_properties_t *pProps, - queue_group_type type); + queue_group_type type, event_pool *eventPool); raii::cache_borrowed_command_list_t commandList; + std::unique_ptr> + internalEvent; + + // TODO: do we need to keep ref count of this for user events? + // For counter based events, we can reuse them safely and l0 event pool + // cannot be destroyed before the queue is released. + ur_event_handle_t lastEvent = nullptr; }; struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { private: + ur_context_handle_t hContext; + ur_device_handle_t hDevice; + ur_queue_flags_t flags; + + raii::cache_borrowed_event_pool eventPool; + ur_command_list_handler_t copyHandler; ur_command_list_handler_t computeHandler; + ur_command_list_handler_t *lastHandler = nullptr; + + std::vector waitList; + + std::pair + getWaitListView(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, + ur_command_list_handler_t *pHandler); + + ur_command_list_handler_t *getCommandListHandlerForCompute(); + ur_command_list_handler_t *getCommandListHandlerForCopy(); + ur_command_list_handler_t *getCommandListHandlerForFill(size_t patternSize); + + ur_event_handle_t getSignalEvent(ur_command_list_handler_t *handler, + ur_event_handle_t *hUserEvent); + + ur_result_t finalizeHandler(ur_command_list_handler_t *handler); + ur_result_t finalizeHandler(ur_command_list_handler_t *handler, + bool blocking); + + ur_result_t enqueueRegionCopyUnlocked( + ur_mem_handle_t src, ur_mem_handle_t dst, bool blocking, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, + size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, 
ur_event_handle_t *phEvent); + + ur_result_t enqueueGenericCopyUnlocked( + ur_mem_handle_t src, ur_mem_handle_t dst, bool blocking, size_t srcOffset, + size_t dstOffset, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); + + ur_result_t enqueueGenericFillUnlocked( + ur_mem_handle_t hBuffer, size_t offset, size_t patternSize, + const void *pPattern, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); public: - ur_queue_immediate_in_order_t(v2::ur_context_handle_t, ur_device_handle_t, + ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t, const ur_queue_properties_t *); + ~ur_queue_immediate_in_order_t() {} ur_result_t queueGetInfo(ur_queue_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) override; @@ -189,12 +242,12 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override; ur_result_t bindlessImagesWaitExternalSemaphoreExp( - ur_exp_interop_semaphore_handle_t hSemaphore, bool hasWaitValue, + ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, uint64_t waitValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override; ur_result_t bindlessImagesSignalExternalSemaphoreExp( - ur_exp_interop_semaphore_handle_t hSemaphore, bool hasSignalValue, + ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, uint64_t signalValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override; diff --git a/source/adapters/level_zero/v2/usm.cpp b/source/adapters/level_zero/v2/usm.cpp new file mode 100644 index 0000000000..f23a6c6fe8 --- /dev/null +++ b/source/adapters/level_zero/v2/usm.cpp @@ -0,0 +1,383 @@ +//===--------- usm.cpp - Level Zero Adapter ------------------------------===// 
+// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ur_api.h" + +#include "../device.hpp" +#include "context.hpp" +#include "umf_pools/disjoint_pool_config_parser.hpp" +#include "usm.hpp" + +#include +#include +#include + +namespace umf { +ur_result_t getProviderNativeError(const char *providerName, + int32_t nativeError) { + if (strcmp(providerName, "Level Zero") == 0) { + return ze2urResult(static_cast(nativeError)); + } + + return UR_RESULT_ERROR_UNKNOWN; +} +} // namespace umf + +static usm::DisjointPoolAllConfigs initializeDisjointPoolConfig() { + const char *PoolUrTraceVal = std::getenv("UR_L0_USM_ALLOCATOR_TRACE"); + + int PoolTrace = 0; + if (PoolUrTraceVal != nullptr) { + PoolTrace = std::atoi(PoolUrTraceVal); + } + + const char *PoolUrConfigVal = std::getenv("UR_L0_USM_ALLOCATOR"); + if (PoolUrConfigVal == nullptr) { + return usm::DisjointPoolAllConfigs(PoolTrace); + } + + return usm::parseDisjointPoolConfig(PoolUrConfigVal, PoolTrace); +} + +inline umf_usm_memory_type_t urToUmfMemoryType(ur_usm_type_t type) { + switch (type) { + case UR_USM_TYPE_DEVICE: + return UMF_MEMORY_TYPE_DEVICE; + case UR_USM_TYPE_SHARED: + return UMF_MEMORY_TYPE_SHARED; + case UR_USM_TYPE_HOST: + return UMF_MEMORY_TYPE_HOST; + default: + throw UR_RESULT_ERROR_INVALID_ARGUMENT; + } +} + +static usm::DisjointPoolMemType +descToDisjoinPoolMemType(const usm::pool_descriptor &desc) { + switch (desc.type) { + case UR_USM_TYPE_DEVICE: + return usm::DisjointPoolMemType::Device; + case UR_USM_TYPE_SHARED: { + if (desc.deviceReadOnly) + return usm::DisjointPoolMemType::SharedReadOnly; + else + return usm::DisjointPoolMemType::Shared; + } + case UR_USM_TYPE_HOST: + return usm::DisjointPoolMemType::Host; + default: + throw 
UR_RESULT_ERROR_INVALID_ARGUMENT; + } +} + +static umf::pool_unique_handle_t +makePool(umf_disjoint_pool_params_t *poolParams, + usm::pool_descriptor poolDescriptor) { + level_zero_memory_provider_params_t params = {}; + params.level_zero_context_handle = poolDescriptor.hContext->getZeHandle(); + params.level_zero_device_handle = + poolDescriptor.hDevice ? poolDescriptor.hDevice->ZeDevice : nullptr; + params.memory_type = urToUmfMemoryType(poolDescriptor.type); + + std::vector residentZeHandles; + + if (poolDescriptor.type == UR_USM_TYPE_DEVICE) { + assert(params.level_zero_device_handle); + auto residentHandles = + poolDescriptor.hContext->getP2PDevices(poolDescriptor.hDevice); + residentZeHandles.push_back(params.level_zero_device_handle); + for (auto &device : residentHandles) { + residentZeHandles.push_back(device->ZeDevice); + } + + params.resident_device_handles = residentZeHandles.data(); + params.resident_device_count = residentZeHandles.size(); + } + + auto [ret, provider] = + umf::providerMakeUniqueFromOps(umfLevelZeroMemoryProviderOps(), ¶ms); + if (ret != UMF_RESULT_SUCCESS) { + throw umf::umf2urResult(ret); + } + + if (!poolParams) { + auto [ret, poolHandle] = umf::poolMakeUniqueFromOps( + umfProxyPoolOps(), std::move(provider), nullptr); + if (ret != UMF_RESULT_SUCCESS) + throw umf::umf2urResult(ret); + return std::move(poolHandle); + } else { + auto [ret, poolHandle] = + umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(provider), + static_cast(poolParams)); + if (ret != UMF_RESULT_SUCCESS) + throw umf::umf2urResult(ret); + return std::move(poolHandle); + } +} + +ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t hContext, + ur_usm_pool_desc_t *pPoolDesc) + : hContext(hContext) { + // TODO: handle UR_USM_POOL_FLAG_ZERO_INITIALIZE_BLOCK from pPoolDesc + auto disjointPoolConfigs = initializeDisjointPoolConfig(); + if (auto limits = find_stype_node(pPoolDesc)) { + for (auto &config : disjointPoolConfigs.Configs) { + 
config.MaxPoolableSize = limits->maxPoolableSize; + config.SlabMinSize = limits->minDriverAllocSize; + } + } + + auto [result, descriptors] = usm::pool_descriptor::create(this, hContext); + if (result != UR_RESULT_SUCCESS) { + throw result; + } + + for (auto &desc : descriptors) { + if (disjointPoolConfigs.EnableBuffers) { + auto &poolConfig = + disjointPoolConfigs.Configs[descToDisjoinPoolMemType(desc)]; + poolManager.addPool(desc, makePool(&poolConfig, desc)); + } else { + poolManager.addPool(desc, makePool(nullptr, desc)); + } + } +} + +ur_context_handle_t ur_usm_pool_handle_t_::getContextHandle() const { + return hContext; +} + +umf_memory_pool_handle_t +ur_usm_pool_handle_t_::getPool(const usm::pool_descriptor &desc) { + auto pool = poolManager.getPool(desc).value(); + assert(pool); + return pool; +} + +ur_result_t ur_usm_pool_handle_t_::allocate( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + const ur_usm_desc_t *pUSMDesc, ur_usm_type_t type, size_t size, + void **ppRetMem) { + uint32_t alignment = pUSMDesc ? pUSMDesc->align : 0; + + auto umfPool = + getPool(usm::pool_descriptor{this, hContext, hDevice, type, false}); + if (!umfPool) { + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } + + *ppRetMem = umfPoolAlignedMalloc(umfPool, size, alignment); + if (*ppRetMem == nullptr) { + auto umfRet = umfPoolGetLastAllocationError(umfPool); + return umf::umf2urResult(umfRet); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_usm_pool_handle_t_::free(void *ptr) { + return umf::umf2urResult(umfFree(ptr)); +} + +namespace ur::level_zero { +ur_result_t urUSMPoolCreate( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_usm_pool_desc_t * + pPoolDesc, ///< [in] pointer to USM pool descriptor. 
Can be chained with + ///< ::ur_usm_pool_limits_desc_t + ur_usm_pool_handle_t *hPool ///< [out] pointer to USM memory pool +) { + + *hPool = new ur_usm_pool_handle_t_(hContext, pPoolDesc); + return UR_RESULT_SUCCESS; +} + +ur_result_t +urUSMPoolRetain(ur_usm_pool_handle_t hPool ///< [in] pointer to USM memory pool +) { + hPool->RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +ur_result_t +urUSMPoolRelease(ur_usm_pool_handle_t hPool ///< [in] pointer to USM memory pool +) { + if (hPool->RefCount.decrementAndTest()) { + delete hPool; + } + return UR_RESULT_SUCCESS; +} + +ur_result_t urUSMPoolGetInfo( + ur_usm_pool_handle_t hPool, ///< [in] handle of the USM memory pool + ur_usm_pool_info_t propName, ///< [in] name of the pool property to query + size_t propSize, ///< [in] size in bytes of the pool property value provided + void *pPropValue, ///< [out][typename(propName, propSize)] value of the pool + ///< property + size_t + *pPropSizeRet ///< [out] size in bytes returned in pool property value +) { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_USM_POOL_INFO_REFERENCE_COUNT: { + return ReturnValue(hPool->RefCount.load()); + } + case UR_USM_POOL_INFO_CONTEXT: { + return ReturnValue(hPool->getContextHandle()); + } + default: { + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + } +} + +ur_result_t urUSMDeviceAlloc( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + const ur_usm_desc_t + *pUSMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t hPool, ///< [in][optional] Pointer to a pool created + ///< using urUSMPoolCreate + size_t + size, ///< [in] size in bytes of the USM memory object to be allocated + void **ppRetMem ///< [out] pointer to USM device memory object +) { + if (!hPool) { + hPool = hContext->getDefaultUSMPool(); + } + + return hPool->allocate(hContext, hDevice, pUSMDesc, 
UR_USM_TYPE_DEVICE, size, + ppRetMem); +} + +ur_result_t urUSMSharedAlloc( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + const ur_usm_desc_t + *pUSMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t hPool, ///< [in][optional] Pointer to a pool created + ///< using urUSMPoolCreate + size_t + size, ///< [in] size in bytes of the USM memory object to be allocated + void **ppRetMem ///< [out] pointer to USM shared memory object +) { + if (!hPool) { + hPool = hContext->getDefaultUSMPool(); + } + + return hPool->allocate(hContext, hDevice, pUSMDesc, UR_USM_TYPE_SHARED, size, + ppRetMem); +} + +ur_result_t urUSMHostAlloc( + ur_context_handle_t hContext, ///< [in] handle of the context object + const ur_usm_desc_t + *pUSMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t hPool, ///< [in][optional] Pointer to a pool created + ///< using urUSMPoolCreate + size_t + size, ///< [in] size in bytes of the USM memory object to be allocated + void **ppRetMem ///< [out] pointer to USM host memory object +) { + if (!hPool) { + hPool = hContext->getDefaultUSMPool(); + } + + return hPool->allocate(hContext, nullptr, pUSMDesc, UR_USM_TYPE_HOST, size, + ppRetMem); +} + +ur_result_t +urUSMFree(ur_context_handle_t hContext, ///< [in] handle of the context object + void *pMem ///< [in] pointer to USM memory object +) { + std::ignore = hContext; + return umf::umf2urResult(umfFree(pMem)); +} + +ur_result_t urUSMGetMemAllocInfo( + ur_context_handle_t hContext, ///< [in] handle of the context object + const void *ptr, ///< [in] pointer to USM memory object + ur_usm_alloc_info_t + propName, ///< [in] the name of the USM allocation property to query + size_t propValueSize, ///< [in] size in bytes of the USM allocation property + ///< value + void *pPropValue, ///< [out][optional] value of the USM allocation property + size_t 
*pPropValueSizeRet ///< [out][optional] bytes returned in USM + ///< allocation property +) { + ze_device_handle_t zeDeviceHandle; + ZeStruct zeMemoryAllocationProperties; + + // TODO: implement this using UMF once + // https://github.com/oneapi-src/unified-memory-framework/issues/686 + // https://github.com/oneapi-src/unified-memory-framework/issues/687 + // are implemented + ZE2UR_CALL(zeMemGetAllocProperties, + (hContext->getZeHandle(), ptr, &zeMemoryAllocationProperties, + &zeDeviceHandle)); + + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + switch (propName) { + case UR_USM_ALLOC_INFO_TYPE: { + ur_usm_type_t memAllocType; + switch (zeMemoryAllocationProperties.type) { + case ZE_MEMORY_TYPE_UNKNOWN: + memAllocType = UR_USM_TYPE_UNKNOWN; + break; + case ZE_MEMORY_TYPE_HOST: + memAllocType = UR_USM_TYPE_HOST; + break; + case ZE_MEMORY_TYPE_DEVICE: + memAllocType = UR_USM_TYPE_DEVICE; + break; + case ZE_MEMORY_TYPE_SHARED: + memAllocType = UR_USM_TYPE_SHARED; + break; + default: + logger::error("urUSMGetMemAllocInfo: unexpected usm memory type"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + return ReturnValue(memAllocType); + } + case UR_USM_ALLOC_INFO_DEVICE: + if (zeDeviceHandle) { + auto Platform = hContext->getPlatform(); + auto Device = Platform->getDeviceFromNativeHandle(zeDeviceHandle); + return Device ? 
ReturnValue(Device) : UR_RESULT_ERROR_INVALID_VALUE; + } else { + return UR_RESULT_ERROR_INVALID_VALUE; + } + case UR_USM_ALLOC_INFO_BASE_PTR: { + void *base; + ZE2UR_CALL(zeMemGetAddressRange, + (hContext->getZeHandle(), ptr, &base, nullptr)); + return ReturnValue(base); + } + case UR_USM_ALLOC_INFO_SIZE: { + size_t size; + ZE2UR_CALL(zeMemGetAddressRange, + (hContext->getZeHandle(), ptr, nullptr, &size)); + return ReturnValue(size); + } + case UR_USM_ALLOC_INFO_POOL: { + // TODO + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + default: + logger::error("urUSMGetMemAllocInfo: unsupported ParamName"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + return UR_RESULT_SUCCESS; +} +} // namespace ur::level_zero diff --git a/source/adapters/level_zero/v2/usm.hpp b/source/adapters/level_zero/v2/usm.hpp new file mode 100644 index 0000000000..5d4170bd6b --- /dev/null +++ b/source/adapters/level_zero/v2/usm.hpp @@ -0,0 +1,34 @@ +//===--------- usm.cpp - Level Zero Adapter ------------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "ur_api.h" + +#include "common.hpp" +#include "ur_pool_manager.hpp" + +struct ur_usm_pool_handle_t_ : _ur_object { + ur_usm_pool_handle_t_(ur_context_handle_t hContext, + ur_usm_pool_desc_t *pPoolDes); + + ur_context_handle_t getContextHandle() const; + + ur_result_t allocate(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, ur_usm_type_t type, + size_t size, void **ppRetMem); + ur_result_t free(void *ptr); + +private: + ur_context_handle_t hContext; + usm::pool_manager poolManager; + + umf_memory_pool_handle_t getPool(const usm::pool_descriptor &desc); +}; diff --git a/source/adapters/level_zero/virtual_mem.cpp b/source/adapters/level_zero/virtual_mem.cpp index e3b90121a1..e89899ded7 100644 --- a/source/adapters/level_zero/virtual_mem.cpp +++ b/source/adapters/level_zero/virtual_mem.cpp @@ -15,7 +15,9 @@ #include "physical_mem.hpp" #include "ur_level_zero.hpp" -UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( +namespace ur::level_zero { + +ur_result_t urVirtualMemGranularityGetInfo( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_virtual_mem_granularity_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { @@ -39,24 +41,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL -urVirtualMemReserve(ur_context_handle_t hContext, const void *pStart, - size_t size, void **ppStart) { +ur_result_t urVirtualMemReserve(ur_context_handle_t hContext, + const void *pStart, size_t size, + void **ppStart) { ZE2UR_CALL(zeVirtualMemReserve, (hContext->ZeContext, pStart, size, ppStart)); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemFree( - ur_context_handle_t hContext, const void *pStart, 
size_t size) { +ur_result_t urVirtualMemFree(ur_context_handle_t hContext, const void *pStart, + size_t size) { ZE2UR_CALL(zeVirtualMemFree, (hContext->ZeContext, pStart, size)); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL -urVirtualMemSetAccess(ur_context_handle_t hContext, const void *pStart, - size_t size, ur_virtual_mem_access_flags_t flags) { +ur_result_t urVirtualMemSetAccess(ur_context_handle_t hContext, + const void *pStart, size_t size, + ur_virtual_mem_access_flags_t flags) { ze_memory_access_attribute_t AccessAttr = ZE_MEMORY_ACCESS_ATTRIBUTE_NONE; if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE) AccessAttr = ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE; @@ -69,10 +71,10 @@ urVirtualMemSetAccess(ur_context_handle_t hContext, const void *pStart, return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL -urVirtualMemMap(ur_context_handle_t hContext, const void *pStart, size_t size, - ur_physical_mem_handle_t hPhysicalMem, size_t offset, - ur_virtual_mem_access_flags_t flags) { +ur_result_t urVirtualMemMap(ur_context_handle_t hContext, const void *pStart, + size_t size, ur_physical_mem_handle_t hPhysicalMem, + size_t offset, + ur_virtual_mem_access_flags_t flags) { ze_memory_access_attribute_t AccessAttr = ZE_MEMORY_ACCESS_ATTRIBUTE_NONE; if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE) AccessAttr = ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE; @@ -86,17 +88,18 @@ urVirtualMemMap(ur_context_handle_t hContext, const void *pStart, size_t size, return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemUnmap( - ur_context_handle_t hContext, const void *pStart, size_t size) { +ur_result_t urVirtualMemUnmap(ur_context_handle_t hContext, const void *pStart, + size_t size) { ZE2UR_CALL(zeVirtualMemUnmap, (hContext->ZeContext, pStart, size)); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGetInfo( - ur_context_handle_t hContext, const void *pStart, - [[maybe_unused]] size_t size, ur_virtual_mem_info_t 
propName, - size_t propSize, void *pPropValue, size_t *pPropSizeRet) { +ur_result_t urVirtualMemGetInfo(ur_context_handle_t hContext, + const void *pStart, + [[maybe_unused]] size_t size, + ur_virtual_mem_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { case UR_VIRTUAL_MEM_INFO_ACCESS_MODE: { @@ -119,3 +122,4 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGetInfo( return UR_RESULT_SUCCESS; } +} // namespace ur::level_zero diff --git a/source/adapters/mock/ur_mock.cpp b/source/adapters/mock/ur_mock.cpp index b1fc9c8c29..c72c1e30ed 100644 --- a/source/adapters/mock/ur_mock.cpp +++ b/source/adapters/mock/ur_mock.cpp @@ -17,13 +17,14 @@ namespace driver { context_t d_context; ur_result_t mock_urPlatformGetApiVersion(void *pParams) { - auto params = *static_cast(pParams); + const auto ¶ms = + *static_cast(pParams); **params.ppVersion = d_context.version; return UR_RESULT_SUCCESS; } ur_result_t mock_urPlatformGetInfo(void *pParams) { - auto params = *static_cast(pParams); + const auto ¶ms = *static_cast(pParams); if (!*params.phPlatform) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } @@ -49,7 +50,7 @@ ur_result_t mock_urPlatformGetInfo(void *pParams) { ////////////////////////////////////////////////////////////////////////// ur_result_t mock_urDeviceGetInfo(void *pParams) { - auto params = *static_cast(pParams); + const auto ¶ms = *static_cast(pParams); switch (*params.ppropName) { case UR_DEVICE_INFO_TYPE: if (*params.ppPropValue != nullptr) { diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index 237415d867..f5e040d508 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -24,7 +24,7 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGet( ur_adapter_handle_t * phAdapters, ///< [out][optional][range(0, NumEntries)] array of handle of adapters. 
///< If NumEntries is less than the number of adapters available, then - ///< ::urAdapterGet shall only retrieve that number of platforms. + ///< ::urAdapterGet shall only retrieve that number of adapters. uint32_t * pNumAdapters ///< [out][optional] returns the total number of adapters available. ) try { @@ -921,7 +921,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. ur_device_handle_t @@ -930,7 +931,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_result_t result = UR_RESULT_SUCCESS; ur_device_create_with_native_handle_params_t params = { - &hNativeDevice, &hPlatform, &pProperties, &phDevice}; + &hNativeDevice, &hAdapter, &pProperties, &phDevice}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -3864,6 +3865,7 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgValue( *pProperties, ///< [in][optional] pointer to value properties. const void *pArgValue ///< [in] argument value represented as matching arg type. + ///< The data pointed to will be copied and therefore can be reused on return. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -5375,17 +5377,16 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that ///< specify the number of local work-items forming a work-group that will ///< execute the kernel function. - ///< If nullptr, the runtime implementation will choose the work-group - ///< size. 
+ ///< If nullptr, the runtime implementation will choose the work-group size. uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -5449,7 +5450,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWait( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -5506,7 +5508,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -5571,7 +5574,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferRead( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. 
If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -5637,7 +5641,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWrite( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -5713,7 +5718,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferReadRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -5803,7 +5809,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -5880,7 +5887,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -5953,7 +5961,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -6019,7 +6028,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferFill( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -6093,7 +6103,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageRead( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -6164,7 +6175,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageWrite( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -6236,7 +6248,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageCopy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -6299,7 +6312,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferMap( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent, ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. void **ppRetMap ///< [out] return mapped pointer. TODO: move it before ///< numEventsInWaitList? ) try { @@ -6365,7 +6379,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemUnmap( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -6430,7 +6445,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -6494,7 +6510,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -6555,7 +6572,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMPrefetch( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -6675,11 +6693,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill2D( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -6745,11 +6763,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. 
- ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -6813,11 +6831,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -6883,11 +6901,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. 
If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -6959,9 +6977,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueReadHostPipe( ///< events that must be complete before the host pipe read. ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * - phEvent ///< [out][optional] returns an event object that identifies this read - ///< command + phEvent ///< [out][optional] returns an event object that identifies this read command ///< and can be used to query or queue a wait for this command to complete. + ///< If phEventWaitList and phEvent are not NULL, phEvent must not refer to + ///< an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -7033,6 +7052,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueWriteHostPipe( ur_event_handle_t * phEvent ///< [out][optional] returns an event object that identifies this write command ///< and can be used to query or queue a wait for this command to complete. + ///< If phEventWaitList and phEvent are not NULL, phEvent must not refer to + ///< an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -7479,7 +7500,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageCopyExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -7699,16 +7721,16 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( size_t size, ///< [in] size of the external memory ur_exp_external_mem_type_t memHandleType, ///< [in] type of external memory handle - ur_exp_interop_mem_desc_t - *pInteropMemDesc, ///< [in] the interop memory descriptor - ur_exp_interop_mem_handle_t - *phInteropMem ///< [out] interop memory handle to the external memory + ur_exp_external_mem_desc_t + *pExternalMemDesc, ///< [in] the external memory descriptor + ur_exp_external_mem_handle_t + *phExternalMem ///< [out] external memory handle to the external memory ) try { ur_result_t result = UR_RESULT_SUCCESS; ur_bindless_images_import_external_memory_exp_params_t params = { - &hContext, &hDevice, &size, - &memHandleType, &pInteropMemDesc, &phInteropMem}; + &hContext, &hDevice, &size, + &memHandleType, &pExternalMemDesc, &phExternalMem}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -7727,7 +7749,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( result = replaceCallback(¶ms); } else { - *phInteropMem = mock::createDummyHandle(); + *phExternalMem = + mock::createDummyHandle(); result = UR_RESULT_SUCCESS; } @@ -7755,16 +7778,16 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description - ur_exp_interop_mem_handle_t - hInteropMem, ///< [in] interop memory handle to the external memory + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory ur_exp_image_mem_native_handle_t * phImageMem ///< [out] image memory handle to the externally allocated memory ) try { ur_result_t result = UR_RESULT_SUCCESS; ur_bindless_images_map_external_array_exp_params_t params = { - &hContext, &hDevice, 
&pImageFormat, - &pImageDesc, &hInteropMem, &phImageMem}; + &hContext, &hDevice, &pImageFormat, + &pImageDesc, &hExternalMem, &phImageMem}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -7805,21 +7828,24 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urBindlessImagesReleaseInteropExp -__urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( +/// @brief Intercept function for urBindlessImagesMapExternalLinearMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( ur_context_handle_t hContext, ///< [in] handle of the context object ur_device_handle_t hDevice, ///< [in] handle of the device object - ur_exp_interop_mem_handle_t - hInteropMem ///< [in][release] handle of interop memory to be destroyed + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory ) try { ur_result_t result = UR_RESULT_SUCCESS; - ur_bindless_images_release_interop_exp_params_t params = { - &hContext, &hDevice, &hInteropMem}; + ur_bindless_images_map_external_linear_memory_exp_params_t params = { + &hContext, &hDevice, &offset, &size, &hExternalMem, &ppRetMem}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( - "urBindlessImagesReleaseInteropExp")); + "urBindlessImagesMapExternalLinearMemoryExp")); if (beforeCallback) { result = beforeCallback(¶ms); if (result != UR_RESULT_SUCCESS) { @@ -7829,12 +7855,11 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( auto replaceCallback = reinterpret_cast( mock::getCallbacks().get_replace_callback( - "urBindlessImagesReleaseInteropExp")); + 
"urBindlessImagesMapExternalLinearMemoryExp")); if (replaceCallback) { result = replaceCallback(¶ms); } else { - mock::releaseDummyHandle(hInteropMem); result = UR_RESULT_SUCCESS; } @@ -7844,7 +7869,57 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( auto afterCallback = reinterpret_cast( mock::getCallbacks().get_after_callback( - "urBindlessImagesReleaseInteropExp")); + "urBindlessImagesMapExternalLinearMemoryExp")); + if (afterCallback) { + return afterCallback(¶ms); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urBindlessImagesReleaseExternalMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + ur_exp_external_mem_handle_t + hExternalMem ///< [in][release] handle of external memory to be destroyed + ) try { + ur_result_t result = UR_RESULT_SUCCESS; + + ur_bindless_images_release_external_memory_exp_params_t params = { + &hContext, &hDevice, &hExternalMem}; + + auto beforeCallback = reinterpret_cast( + mock::getCallbacks().get_before_callback( + "urBindlessImagesReleaseExternalMemoryExp")); + if (beforeCallback) { + result = beforeCallback(¶ms); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + auto replaceCallback = reinterpret_cast( + mock::getCallbacks().get_replace_callback( + "urBindlessImagesReleaseExternalMemoryExp")); + if (replaceCallback) { + result = replaceCallback(¶ms); + } else { + + mock::releaseDummyHandle(hExternalMem); + result = UR_RESULT_SUCCESS; + } + + if (result != UR_RESULT_SUCCESS) { + return result; + } + + auto afterCallback = reinterpret_cast( + mock::getCallbacks().get_after_callback( + "urBindlessImagesReleaseExternalMemoryExp")); if (afterCallback) { 
return afterCallback(¶ms); } @@ -7861,16 +7936,16 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( ur_device_handle_t hDevice, ///< [in] handle of the device object ur_exp_external_semaphore_type_t semHandleType, ///< [in] type of external memory handle - ur_exp_interop_semaphore_desc_t - *pInteropSemaphoreDesc, ///< [in] the interop semaphore descriptor - ur_exp_interop_semaphore_handle_t * - phInteropSemaphore ///< [out] interop semaphore handle to the external semaphore + ur_exp_external_semaphore_desc_t + *pExternalSemaphoreDesc, ///< [in] the external semaphore descriptor + ur_exp_external_semaphore_handle_t * + phExternalSemaphore ///< [out] external semaphore handle to the external semaphore ) try { ur_result_t result = UR_RESULT_SUCCESS; ur_bindless_images_import_external_semaphore_exp_params_t params = { - &hContext, &hDevice, &semHandleType, &pInteropSemaphoreDesc, - &phInteropSemaphore}; + &hContext, &hDevice, &semHandleType, &pExternalSemaphoreDesc, + &phExternalSemaphore}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -7889,8 +7964,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( result = replaceCallback(¶ms); } else { - *phInteropSemaphore = - mock::createDummyHandle(); + *phExternalSemaphore = + mock::createDummyHandle(); result = UR_RESULT_SUCCESS; } @@ -7915,13 +7990,13 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( ur_context_handle_t hContext, ///< [in] handle of the context object ur_device_handle_t hDevice, ///< [in] handle of the device object - ur_exp_interop_semaphore_handle_t - hInteropSemaphore ///< [in][release] handle of interop semaphore to be destroyed + ur_exp_external_semaphore_handle_t + hExternalSemaphore ///< [in][release] handle of external semaphore to be destroyed ) try { ur_result_t result = 
UR_RESULT_SUCCESS; ur_bindless_images_release_external_semaphore_exp_params_t params = { - &hContext, &hDevice, &hInteropSemaphore}; + &hContext, &hDevice, &hExternalSemaphore}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -7940,7 +8015,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( result = replaceCallback(¶ms); } else { - mock::releaseDummyHandle(hInteropSemaphore); + mock::releaseDummyHandle(hExternalSemaphore); result = UR_RESULT_SUCCESS; } @@ -7964,8 +8039,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( /// @brief Intercept function for urBindlessImagesWaitExternalSemaphoreExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( ur_queue_handle_t hQueue, ///< [in] handle of the queue object - ur_exp_interop_semaphore_handle_t - hSemaphore, ///< [in] interop semaphore handle + ur_exp_external_semaphore_handle_t + hSemaphore, ///< [in] external semaphore handle bool hasWaitValue, ///< [in] indicates whether the samephore is capable and should wait on a ///< certain value. @@ -7981,7 +8056,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -8034,8 +8110,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( /// @brief Intercept function for urBindlessImagesSignalExternalSemaphoreExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( ur_queue_handle_t hQueue, ///< [in] handle of the queue object - ur_exp_interop_semaphore_handle_t - hSemaphore, ///< [in] interop semaphore handle + ur_exp_external_semaphore_handle_t + hSemaphore, ///< [in] external semaphore handle bool hasSignalValue, ///< [in] indicates whether the samephore is capable and should signal on a ///< certain value. @@ -8051,7 +8127,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -8294,16 +8371,37 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( const size_t * pGlobalWorkSize, ///< [in] Global work size to use when executing kernel. const size_t * - pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. + pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. If this + ///< parameter is nullptr, then a local work size will be generated by the + ///< implementation. + uint32_t + numKernelAlternatives, ///< [in] The number of kernel alternatives provided in + ///< phKernelAlternatives. + ur_kernel_handle_t * + phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernel handles + ///< that might be used to update the kernel in this + ///< command after the command-buffer is finalized. 
The default kernel + ///< `hKernel` is implicitly marked as an alternative. It's + ///< invalid to specify it as part of this list. uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * pSyncPoint, ///< [out][optional] Sync point associated with this command. - ur_exp_command_buffer_command_handle_t - *phCommand ///< [out][optional] Handle to this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t * + phCommand ///< [out][optional] Handle to this command. Only available if the + ///< command-buffer is updatable. 
) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -8314,9 +8412,14 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( &pGlobalWorkOffset, &pGlobalWorkSize, &pLocalWorkSize, + &numKernelAlternatives, + &phKernelAlternatives, &numSyncPointsInWaitList, &pSyncPointWaitList, + &numEventsInWaitList, + &phEventWaitList, &pSyncPoint, + &phEvent, &phCommand}; auto beforeCallback = reinterpret_cast( @@ -8336,6 +8439,10 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( result = replaceCallback(¶ms); } else { + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } // optional output handle if (phCommand) { *phCommand = mock::createDummyHandle< @@ -8373,14 +8480,34 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) try { ur_result_t result = UR_RESULT_SUCCESS; ur_command_buffer_append_usm_memcpy_exp_params_t params = { - &hCommandBuffer, &pDst, &pSrc, &size, &numSyncPointsInWaitList, - &pSyncPointWaitList, &pSyncPoint}; + &hCommandBuffer, + &pDst, + &pSrc, + &size, + &numSyncPointsInWaitList, + &pSyncPointWaitList, + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -8399,6 +8526,15 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( result = replaceCallback(¶ms); } else { + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } + // optional output handle + if (phCommand) { + *phCommand = mock::createDummyHandle< + ur_exp_command_buffer_command_handle_t>(); + } result = UR_RESULT_SUCCESS; } @@ -8433,15 +8569,35 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) try { ur_result_t result = UR_RESULT_SUCCESS; ur_command_buffer_append_usm_fill_exp_params_t params = { - &hCommandBuffer, &pMemory, &pPattern, - &patternSize, &size, &numSyncPointsInWaitList, - &pSyncPointWaitList, &pSyncPoint}; + &hCommandBuffer, + &pMemory, + &pPattern, + &patternSize, + &size, + &numSyncPointsInWaitList, + &pSyncPointWaitList, + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -8460,6 +8616,15 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( result = replaceCallback(¶ms); } else { + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } + // optional output handle + if (phCommand) { + *phCommand = mock::createDummyHandle< + ur_exp_command_buffer_command_handle_t>(); + } result = UR_RESULT_SUCCESS; } @@ -8494,8 +8659,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. 
+ ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -8508,7 +8684,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( &size, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -8527,6 +8707,15 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( result = replaceCallback(¶ms); } else { + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } + // optional output handle + if (phCommand) { + *phCommand = mock::createDummyHandle< + ur_exp_command_buffer_command_handle_t>(); + } result = UR_RESULT_SUCCESS; } @@ -8561,8 +8750,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -8574,7 +8774,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( &pSrc, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -8593,6 +8797,15 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( result = replaceCallback(¶ms); } else { + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } + // optional output handle + if (phCommand) { + *phCommand = mock::createDummyHandle< + ur_exp_command_buffer_command_handle_t>(); + } result = UR_RESULT_SUCCESS; } @@ -8626,8 +8839,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -8639,7 +8863,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( &pDst, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -8658,6 +8886,15 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( result = replaceCallback(¶ms); } else { + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } + // optional output handle + if (phCommand) { + *phCommand = mock::createDummyHandle< + ur_exp_command_buffer_command_handle_t>(); + } result = UR_RESULT_SUCCESS; } @@ -8699,8 +8936,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -8717,7 +8965,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( &dstSlicePitch, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -8736,6 +8988,15 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( result = replaceCallback(¶ms); } else { + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } + // optional output handle + if (phCommand) { + *phCommand = mock::createDummyHandle< + ur_exp_command_buffer_command_handle_t>(); + } result = UR_RESULT_SUCCESS; } @@ -8783,8 +9044,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -8801,7 +9073,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( &pSrc, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -8820,6 +9096,15 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( result = replaceCallback(¶ms); } else { + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } + // optional output handle + if (phCommand) { + *phCommand = mock::createDummyHandle< + ur_exp_command_buffer_command_handle_t>(); + } result = UR_RESULT_SUCCESS; } @@ -8865,8 +9150,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -8883,7 +9179,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( &pDst, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -8902,6 +9202,15 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( result = replaceCallback(¶ms); } else { + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } + // optional output handle + if (phCommand) { + *phCommand = mock::createDummyHandle< + ur_exp_command_buffer_command_handle_t>(); + } result = UR_RESULT_SUCCESS; } @@ -8937,8 +9246,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -8951,7 +9271,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( &size, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -8970,6 +9294,15 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( result = replaceCallback(¶ms); } else { + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } + // optional output handle + if (phCommand) { + *phCommand = mock::createDummyHandle< + ur_exp_command_buffer_command_handle_t>(); + } result = UR_RESULT_SUCCESS; } @@ -9002,8 +9335,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -9014,7 +9358,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( &flags, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -9033,6 +9381,15 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( result = replaceCallback(¶ms); } else { + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } + // optional output handle + if (phCommand) { + *phCommand = mock::createDummyHandle< + ur_exp_command_buffer_command_handle_t>(); + } result = UR_RESULT_SUCCESS; } @@ -9065,8 +9422,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -9077,7 +9445,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( &advice, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -9096,6 +9468,15 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( result = replaceCallback(¶ms); } else { + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } + // optional output handle + if (phCommand) { + *phCommand = mock::createDummyHandle< + ur_exp_command_buffer_command_handle_t>(); + } result = UR_RESULT_SUCCESS; } @@ -9129,7 +9510,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferEnqueueExp( ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command-buffer execution instance. + ///< command-buffer execution instance. If phEventWaitList and phEvent are + ///< not NULL, phEvent must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -9317,6 +9699,107 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urCommandBufferUpdateSignalEventExp +__urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateSignalEventExp( + ur_exp_command_buffer_command_handle_t + hCommand, ///< [in] Handle of the command-buffer command to update. + ur_event_handle_t *phSignalEvent ///< [out] Event to be signaled. 
+ ) try { + ur_result_t result = UR_RESULT_SUCCESS; + + ur_command_buffer_update_signal_event_exp_params_t params = { + &hCommand, &phSignalEvent}; + + auto beforeCallback = reinterpret_cast( + mock::getCallbacks().get_before_callback( + "urCommandBufferUpdateSignalEventExp")); + if (beforeCallback) { + result = beforeCallback(¶ms); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + auto replaceCallback = reinterpret_cast( + mock::getCallbacks().get_replace_callback( + "urCommandBufferUpdateSignalEventExp")); + if (replaceCallback) { + result = replaceCallback(¶ms); + } else { + + *phSignalEvent = mock::createDummyHandle(); + result = UR_RESULT_SUCCESS; + } + + if (result != UR_RESULT_SUCCESS) { + return result; + } + + auto afterCallback = reinterpret_cast( + mock::getCallbacks().get_after_callback( + "urCommandBufferUpdateSignalEventExp")); + if (afterCallback) { + return afterCallback(¶ms); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urCommandBufferUpdateWaitEventsExp +__urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateWaitEventsExp( + ur_exp_command_buffer_command_handle_t + hCommand, ///< [in] Handle of the command-buffer command to update. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. 
+ ) try { + ur_result_t result = UR_RESULT_SUCCESS; + + ur_command_buffer_update_wait_events_exp_params_t params = { + &hCommand, &numEventsInWaitList, &phEventWaitList}; + + auto beforeCallback = reinterpret_cast( + mock::getCallbacks().get_before_callback( + "urCommandBufferUpdateWaitEventsExp")); + if (beforeCallback) { + result = beforeCallback(¶ms); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + auto replaceCallback = reinterpret_cast( + mock::getCallbacks().get_replace_callback( + "urCommandBufferUpdateWaitEventsExp")); + if (replaceCallback) { + result = replaceCallback(¶ms); + } else { + + result = UR_RESULT_SUCCESS; + } + + if (result != UR_RESULT_SUCCESS) { + return result; + } + + auto afterCallback = reinterpret_cast( + mock::getCallbacks().get_after_callback( + "urCommandBufferUpdateWaitEventsExp")); + if (afterCallback) { + return afterCallback(¶ms); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urCommandBufferGetInfoExp __urdlllocal ur_result_t UR_APICALL urCommandBufferGetInfoExp( @@ -9445,17 +9928,16 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that ///< specify the number of local work-items forming a work-group that will ///< execute the kernel function. - ///< If nullptr, the runtime implementation will choose the work-group - ///< size. + ///< If nullptr, the runtime implementation will choose the work-group size. uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. 
+ ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -9575,8 +10057,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait - ///< events. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies this particular kernel ///< execution instance. Profiling information can be queried @@ -9584,7 +10065,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` ///< reports the timestamp at the time of the call to this function. ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` - ///< reports the timestamp recorded when the command is executed on the device. + ///< reports the timestamp recorded when the command is executed on the + ///< device. If phEventWaitList and phEvent are not NULL, phEvent must not + ///< refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -9656,7 +10139,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( ///< the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. 
+ ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList + ///< array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -10123,7 +10608,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies the work that has - ///< been enqueued in nativeEnqueueFunc. + ///< been enqueued in nativeEnqueueFunc. If phEventWaitList and phEvent are + ///< not NULL, phEvent must not refer to an element of the phEventWaitList array. ) try { ur_result_t result = UR_RESULT_SUCCESS; @@ -10437,7 +10923,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnMapExternalArrayExp = driver::urBindlessImagesMapExternalArrayExp; - pDdiTable->pfnReleaseInteropExp = driver::urBindlessImagesReleaseInteropExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + driver::urBindlessImagesMapExternalLinearMemoryExp; + + pDdiTable->pfnReleaseExternalMemoryExp = + driver::urBindlessImagesReleaseExternalMemoryExp; pDdiTable->pfnImportExternalSemaphoreExp = driver::urBindlessImagesImportExternalSemaphoreExp; @@ -10531,6 +11021,12 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( pDdiTable->pfnUpdateKernelLaunchExp = driver::urCommandBufferUpdateKernelLaunchExp; + pDdiTable->pfnUpdateSignalEventExp = + driver::urCommandBufferUpdateSignalEventExp; + + pDdiTable->pfnUpdateWaitEventsExp = + driver::urCommandBufferUpdateWaitEventsExp; + pDdiTable->pfnGetInfoExp = driver::urCommandBufferGetInfoExp; pDdiTable->pfnCommandGetInfoExp = driver::urCommandBufferCommandGetInfoExp; diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 9793ab36a0..69f7fff6bd 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ 
b/source/adapters/native_cpu/CMakeLists.txt @@ -42,6 +42,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.hpp ) +install_ur_library(${TARGET_NAME}) set_target_properties(${TARGET_NAME} PROPERTIES VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" @@ -53,6 +54,7 @@ find_package(Threads REQUIRED) target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common + ${PROJECT_NAME}::umf Threads::Threads ) diff --git a/source/adapters/native_cpu/command_buffer.cpp b/source/adapters/native_cpu/command_buffer.cpp index fde6c03b86..91044771ec 100644 --- a/source/adapters/native_cpu/command_buffer.cpp +++ b/source/adapters/native_cpu/command_buffer.cpp @@ -49,9 +49,9 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t, ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, const size_t *, uint32_t, - const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_command_handle_t *) { + ur_kernel_handle_t *, uint32_t, const ur_exp_command_buffer_sync_point_t *, + uint32_t, const ur_event_handle_t *, ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, ur_exp_command_buffer_command_handle_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for the NativeCPU adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -59,8 +59,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( ur_exp_command_buffer_handle_t, void *, const void *, size_t, uint32_t, - const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *) { + const ur_exp_command_buffer_sync_point_t *, uint32_t, + const ur_event_handle_t *, 
ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, ur_exp_command_buffer_command_handle_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for the NativeCPU adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -69,7 +70,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( ur_exp_command_buffer_handle_t, ur_mem_handle_t, ur_mem_handle_t, size_t, size_t, size_t, uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *) { + uint32_t, const ur_event_handle_t *, ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, ur_exp_command_buffer_command_handle_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for the NativeCPU adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -79,7 +81,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( ur_exp_command_buffer_handle_t, ur_mem_handle_t, ur_mem_handle_t, ur_rect_offset_t, ur_rect_offset_t, ur_rect_region_t, size_t, size_t, size_t, size_t, uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *) { + uint32_t, const ur_event_handle_t *, ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, ur_exp_command_buffer_command_handle_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for the NativeCPU adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -89,7 +92,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( ur_exp_command_buffer_handle_t, ur_mem_handle_t, size_t, size_t, const void *, uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *) { + uint32_t, const ur_event_handle_t *, ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, ur_exp_command_buffer_command_handle_t *) { detail::ur::die("Experimental Command-buffer 
feature is not " "implemented for the NativeCPU adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -98,8 +102,9 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( ur_exp_command_buffer_handle_t, ur_mem_handle_t, size_t, size_t, void *, - uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *) { + uint32_t, const ur_exp_command_buffer_sync_point_t *, uint32_t, + const ur_event_handle_t *, ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, ur_exp_command_buffer_command_handle_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for the NativeCPU adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -109,8 +114,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( ur_exp_command_buffer_handle_t, ur_mem_handle_t, ur_rect_offset_t, ur_rect_offset_t, ur_rect_region_t, size_t, size_t, size_t, size_t, void *, - uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *) { + uint32_t, const ur_exp_command_buffer_sync_point_t *, uint32_t, + const ur_event_handle_t *, ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, ur_exp_command_buffer_command_handle_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for the NativeCPU adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -120,8 +126,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( ur_exp_command_buffer_handle_t, ur_mem_handle_t, ur_rect_offset_t, ur_rect_offset_t, ur_rect_region_t, size_t, size_t, size_t, size_t, void *, - uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *) { + uint32_t, const ur_exp_command_buffer_sync_point_t *, uint32_t, + const ur_event_handle_t *, ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, 
ur_exp_command_buffer_command_handle_t *) { detail::ur::die("Experimental Command-buffer feature is not " "implemented for the NativeCPU adapter."); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -138,29 +145,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( ur_exp_command_buffer_handle_t, ur_mem_handle_t, const void *, size_t, size_t, size_t, uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *) { + uint32_t, const ur_event_handle_t *, ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, ur_exp_command_buffer_command_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( ur_exp_command_buffer_handle_t, void *, const void *, size_t, size_t, - uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *) { + uint32_t, const ur_exp_command_buffer_sync_point_t *, uint32_t, + const ur_event_handle_t *, ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, ur_exp_command_buffer_command_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( ur_exp_command_buffer_handle_t, const void *, size_t, ur_usm_migration_flags_t, uint32_t, - const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *) { + const ur_exp_command_buffer_sync_point_t *, uint32_t, + const ur_event_handle_t *, ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, ur_exp_command_buffer_command_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( ur_exp_command_buffer_handle_t, const void *, size_t, ur_usm_advice_flags_t, - uint32_t, const ur_exp_command_buffer_sync_point_t *, - ur_exp_command_buffer_sync_point_t *) { + uint32_t, const ur_exp_command_buffer_sync_point_t *, 
uint32_t, + const ur_event_handle_t *, ur_exp_command_buffer_sync_point_t *, + ur_event_handle_t *, ur_exp_command_buffer_command_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -180,6 +191,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateSignalEventExp( + ur_exp_command_buffer_command_handle_t, ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferUpdateWaitEventsExp(ur_exp_command_buffer_command_handle_t, + uint32_t, const ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp( ur_exp_command_buffer_handle_t, ur_exp_command_buffer_info_t, size_t, void *, size_t *) { diff --git a/source/adapters/native_cpu/context.hpp b/source/adapters/native_cpu/context.hpp index 30bfb31d71..c59ab4eafb 100644 --- a/source/adapters/native_cpu/context.hpp +++ b/source/adapters/native_cpu/context.hpp @@ -10,13 +10,142 @@ #pragma once +#include +#include #include #include "common.hpp" #include "device.hpp" +#include "ur/ur.hpp" + +namespace native_cpu { +struct usm_alloc_info { + ur_usm_type_t type; + const void *base_ptr; + size_t size; + ur_device_handle_t device; + ur_usm_pool_handle_t pool; + + // We store a pointer to the actual allocation because it is needed when + // freeing memory. 
+ void *base_alloc_ptr; + constexpr usm_alloc_info(ur_usm_type_t type, const void *base_ptr, + size_t size, ur_device_handle_t device, + ur_usm_pool_handle_t pool, void *base_alloc_ptr) + : type(type), base_ptr(base_ptr), size(size), device(device), pool(pool), + base_alloc_ptr(base_alloc_ptr) {} +}; + +constexpr usm_alloc_info usm_alloc_info_null_entry(UR_USM_TYPE_UNKNOWN, nullptr, + 0, nullptr, nullptr, + nullptr); + +constexpr size_t alloc_header_size = sizeof(usm_alloc_info); + +// Computes the padding that we need to add to ensure the +// pointer returned by UR is aligned as the user requested. +static size_t get_padding(uint32_t alignment) { + assert(alignment >= alignof(usm_alloc_info) && + "memory not aligned to usm_alloc_info"); + if (!alignment || alloc_header_size % alignment == 0) + return 0; + size_t padd = 0; + if (alignment <= alloc_header_size) { + padd = alignment - (alloc_header_size % alignment); + } else { + padd = alignment - alloc_header_size; + } + return padd; +} + +// In order to satisfy the MemAllocInfo queries we allocate extra memory +// for the native_cpu::usm_alloc_info struct. +// To satisfy the alignment requirements we "pad" the memory +// allocation so that the pointer returned to the user +// always satisfies (ptr % align) == 0. +static inline void *malloc_impl(uint32_t alignment, size_t size) { + void *ptr = nullptr; + assert(alignment >= alignof(usm_alloc_info) && + "memory not aligned to usm_alloc_info"); +#ifdef _MSC_VER + ptr = _aligned_malloc(alloc_header_size + get_padding(alignment) + size, + alignment); + +#else + ptr = std::aligned_alloc(alignment, + alloc_header_size + get_padding(alignment) + size); +#endif + return ptr; +} + +// The info struct is retrieved by subtracting its size from the pointer +// returned to the user. 
+static inline uint8_t *get_alloc_info_addr(const void *ptr) { + return (uint8_t *)const_cast(ptr) - alloc_header_size; +} + +static usm_alloc_info get_alloc_info(void *ptr) { + return *(usm_alloc_info *)get_alloc_info_addr(ptr); +} + +} // namespace native_cpu struct ur_context_handle_t_ : RefCounted { ur_context_handle_t_(ur_device_handle_t_ *phDevices) : _device{phDevices} {} ur_device_handle_t _device; + + ur_result_t remove_alloc(void *ptr) { + std::lock_guard lock(alloc_mutex); + const native_cpu::usm_alloc_info &info = native_cpu::get_alloc_info(ptr); + UR_ASSERT(info.type != UR_USM_TYPE_UNKNOWN, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); +#ifdef _MSC_VER + _aligned_free(info.base_alloc_ptr); +#else + free(info.base_alloc_ptr); +#endif + allocations.erase(ptr); + return UR_RESULT_SUCCESS; + } + + // Note this is made non-const to access the mutex + const native_cpu::usm_alloc_info &get_alloc_info_entry(const void *ptr) { + std::lock_guard lock(alloc_mutex); + auto it = allocations.find(ptr); + if (it == allocations.end()) { + return native_cpu::usm_alloc_info_null_entry; + } + + return *(native_cpu::usm_alloc_info *)native_cpu::get_alloc_info_addr(ptr); + } + + void *add_alloc(uint32_t alignment, ur_usm_type_t type, size_t size, + ur_usm_pool_handle_t pool) { + std::lock_guard lock(alloc_mutex); + // We need to ensure that we align to at least alignof(usm_alloc_info), + // otherwise its start address may be unaligned. + alignment = + std::max(alignment, alignof(native_cpu::usm_alloc_info)); + void *alloc = native_cpu::malloc_impl(alignment, size); + if (!alloc) + return nullptr; + // Compute the address of the pointer that we'll return to the user. 
+ void *ptr = native_cpu::alloc_header_size + + native_cpu::get_padding(alignment) + (uint8_t *)alloc; + uint8_t *info_addr = native_cpu::get_alloc_info_addr(ptr); + if (!info_addr) + return nullptr; + // Do a placement new of the alloc_info to avoid allocation and copy + auto info = new (info_addr) + native_cpu::usm_alloc_info(type, ptr, size, this->_device, pool, alloc); + if (!info) + return nullptr; + allocations.insert(ptr); + return ptr; + } + +private: + std::mutex alloc_mutex; + std::set allocations; }; diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp index 64d99927ae..2a829a82e1 100644 --- a/source/adapters/native_cpu/device.cpp +++ b/source/adapters/native_cpu/device.cpp @@ -12,6 +12,72 @@ #include "platform.hpp" +#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__) +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#endif + +#ifdef __linux__ +#include +#include + +#include +#include +#endif + +#ifdef __APPLE__ +#include +#include +#endif + +#ifdef __MCOS_POSIX__ +#include +#endif + +uint64_t os_memory_total_size() { +#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__) + MEMORYSTATUSEX status; + status.dwLength = sizeof(status); + if (GlobalMemoryStatusEx(&status)) { + return static_cast(status.ullTotalPhys); + } else { + return 0; + } +#elif defined(__APPLE__) + // query the physical memory size by name, name documented in + // https://opensource.apple.com/source/xnu/xnu-792.12.6/libkern/libkern/sysctl.h + uint64_t memsize; + size_t size = sizeof(uint64_t); + if (sysctlbyname("hw.memsize", &memsize, &size, nullptr, 0)) { + return 0; + } + return memsize; +#elif defined(__linux__) + struct sysinfo info; + if (0 == sysinfo(&info)) { + return static_cast(info.totalram) * + static_cast(info.mem_unit); + } else { + return 0; + } +#elif defined(__MCOS_POSIX__) + return emcos::get_device_total_memory_size(); +#else +#error Unknown platform! 
+#endif +} + +static uint64_t os_memory_bounded_size() { + const uint64_t size = os_memory_total_size(); + // Limit the memory size to what fits in a size_t, this is necessary when + // compiling for 32 bits on a 64 bits host + return std::numeric_limits::max() >= size + ? size + : std::numeric_limits::max(); +} + UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, ur_device_type_t DeviceType, uint32_t NumEntries, @@ -94,9 +160,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_VERSION: return ReturnValue("0.1"); case UR_DEVICE_INFO_COMPILER_AVAILABLE: - return ReturnValue(bool{false}); + return ReturnValue(bool{true}); case UR_DEVICE_INFO_LINKER_AVAILABLE: - return ReturnValue(bool{false}); + return ReturnValue(bool{true}); case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: return ReturnValue(static_cast(hDevice->tp.num_threads())); case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: @@ -114,6 +180,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: // TODO: provide a mechanism to estimate/configure this. return ReturnValue(size_t{2048}); + case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: + // Set the max sub groups to be the same as the max work group size. 
+ return ReturnValue(uint32_t{2048}); case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: // Imported from level_zero return ReturnValue(uint32_t{8}); @@ -223,8 +292,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // TODO : CHECK return ReturnValue(uint64_t{0}); case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: - // TODO : CHECK - return ReturnValue(uint64_t{32768}); + return ReturnValue(hDevice->mem_size); case UR_DEVICE_INFO_LOCAL_MEM_SIZE: // TODO : CHECK return ReturnValue(uint64_t{32768}); @@ -252,9 +320,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(bool{false}); case UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: return ReturnValue(ur_device_affinity_domain_flags_t{0}); - case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: - // TODO : CHECK - return ReturnValue(uint64_t{0}); + case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { + size_t Global = hDevice->mem_size; + + auto QuarterGlobal = static_cast(Global / 4u); + + auto MaxAlloc = std::max(std::min(1024u * 1024u * 1024u, QuarterGlobal), + 32u * 1024u * 1024u); + + return ReturnValue(uint64_t{MaxAlloc}); + } case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: // TODO : CHECK return ReturnValue(ur_device_exec_capability_flags_t{ @@ -285,18 +360,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: case UR_DEVICE_INFO_UUID: case UR_DEVICE_INFO_DEVICE_ID: - case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: case UR_DEVICE_INFO_IL_VERSION: case UR_DEVICE_INFO_MAX_WORK_GROUPS_3D: case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: return UR_RESULT_ERROR_INVALID_VALUE; + case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { + // Currently for Native CPU fences are implemented using OCK + // builtins, so we have different capabilities than atomic operations + ur_memory_order_capability_flags_t Capabilities = + 
UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL | + UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; + return ReturnValue(Capabilities); + } case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { ur_memory_order_capability_flags_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED; return ReturnValue(Capabilities); } + case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { uint64_t Capabilities = UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | @@ -316,8 +402,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(false); case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: - case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: + case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP: return ReturnValue(false); + case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP: + return ReturnValue( + static_cast(0)); case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: return ReturnValue(false); @@ -366,11 +455,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, + ur_native_handle_t hNativeDevice, ur_adapter_handle_t hAdapter, const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { std::ignore = hNativeDevice; - std::ignore = hPlatform; + std::ignore = hAdapter; std::ignore = pProperties; std::ignore = phDevice; @@ -420,3 +509,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( // No image can be loaded for the given device return UR_RESULT_ERROR_INVALID_BINARY; } + +ur_device_handle_t_::ur_device_handle_t_(ur_platform_handle_t ArgPlt) + : mem_size(os_memory_bounded_size()), Platform(ArgPlt) {} diff --git 
a/source/adapters/native_cpu/device.hpp b/source/adapters/native_cpu/device.hpp index 01245410c9..2308c1a7f4 100644 --- a/source/adapters/native_cpu/device.hpp +++ b/source/adapters/native_cpu/device.hpp @@ -15,7 +15,8 @@ struct ur_device_handle_t_ { native_cpu::threadpool_t tp; - ur_device_handle_t_(ur_platform_handle_t ArgPlt) : Platform(ArgPlt) {} + ur_device_handle_t_(ur_platform_handle_t ArgPlt); + const uint64_t mem_size; ur_platform_handle_t Platform; }; diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 835a7febcf..33d8c35c36 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -81,11 +81,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( DIE_NO_IMPLEMENTATION; } - // Check reqd_work_group_size - if (hKernel->hasReqdWGSize() && pLocalWorkSize != nullptr) { - const auto &Reqd = hKernel->getReqdWGSize(); + // Check reqd_work_group_size and other kernel constraints + if (pLocalWorkSize != nullptr) { + uint64_t TotalNumWIs = 1; for (uint32_t Dim = 0; Dim < workDim; Dim++) { - if (pLocalWorkSize[Dim] != Reqd[Dim]) { + TotalNumWIs *= pLocalWorkSize[Dim]; + if (auto Reqd = hKernel->getReqdWGSize(); + Reqd && pLocalWorkSize[Dim] != Reqd.value()[Dim]) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (auto MaxWG = hKernel->getMaxWGSize(); + MaxWG && pLocalWorkSize[Dim] > MaxWG.value()[Dim]) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + } + if (auto MaxLinearWG = hKernel->getMaxLinearWGSize()) { + if (TotalNumWIs > MaxLinearWG) { return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; } } @@ -511,8 +522,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( UR_ASSERT(ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pPattern, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(size % patternSize == 0 || patternSize > size, - UR_RESULT_ERROR_INVALID_SIZE); + UR_ASSERT(patternSize != 0, UR_RESULT_ERROR_INVALID_SIZE) + UR_ASSERT(size != 0, 
UR_RESULT_ERROR_INVALID_SIZE) + UR_ASSERT(patternSize < size, UR_RESULT_ERROR_INVALID_SIZE) + UR_ASSERT(size % patternSize == 0, UR_RESULT_ERROR_INVALID_SIZE) + // TODO: add check for allocation size once the query is supported switch (patternSize) { case 1: @@ -522,7 +536,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( const auto pattern = *static_cast(pPattern); auto *start = reinterpret_cast(ptr); auto *end = - reinterpret_cast(reinterpret_cast(ptr) + size); + reinterpret_cast(reinterpret_cast(ptr) + size); std::fill(start, end, pattern); break; } @@ -530,7 +544,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( const auto pattern = *static_cast(pPattern); auto *start = reinterpret_cast(ptr); auto *end = - reinterpret_cast(reinterpret_cast(ptr) + size); + reinterpret_cast(reinterpret_cast(ptr) + size); std::fill(start, end, pattern); break; } @@ -538,17 +552,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( const auto pattern = *static_cast(pPattern); auto *start = reinterpret_cast(ptr); auto *end = - reinterpret_cast(reinterpret_cast(ptr) + size); + reinterpret_cast(reinterpret_cast(ptr) + size); std::fill(start, end, pattern); break; } - default: - for (unsigned int step{0}; step < size; ++step) { - auto *dest = reinterpret_cast(reinterpret_cast(ptr) + - step * patternSize); + default: { + for (unsigned int step{0}; step < size; step += patternSize) { + auto *dest = + reinterpret_cast(reinterpret_cast(ptr) + step); memcpy(dest, pPattern, patternSize); } } + } return UR_RESULT_SUCCESS; } @@ -583,7 +598,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( std::ignore = phEventWaitList; std::ignore = phEvent; - DIE_NO_IMPLEMENTATION; + // TODO: properly implement USM prefetch + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL @@ -595,7 +611,8 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, std::ignore = advice; std::ignore = phEvent; - DIE_NO_IMPLEMENTATION; + // TODO: properly 
implement USM advise + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D( diff --git a/source/adapters/native_cpu/image.cpp b/source/adapters/native_cpu/image.cpp index 3b777ba7fd..d89990ed10 100644 --- a/source/adapters/native_cpu/image.cpp +++ b/source/adapters/native_cpu/image.cpp @@ -117,8 +117,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, [[maybe_unused]] size_t size, [[maybe_unused]] ur_exp_external_mem_type_t memHandleType, - [[maybe_unused]] ur_exp_interop_mem_desc_t *pInteropMemDesc, - [[maybe_unused]] ur_exp_interop_mem_handle_t *phInteropMem) { + [[maybe_unused]] ur_exp_external_mem_desc_t *pExternalMemDesc, + [[maybe_unused]] ur_exp_external_mem_handle_t *phExternalMem) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -127,15 +127,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( [[maybe_unused]] ur_device_handle_t hDevice, [[maybe_unused]] const ur_image_format_t *pImageFormat, [[maybe_unused]] const ur_image_desc_t *pImageDesc, - [[maybe_unused]] ur_exp_interop_mem_handle_t hInteropMem, + [[maybe_unused]] ur_exp_external_mem_handle_t hExternalMem, [[maybe_unused]] ur_exp_image_mem_native_handle_t *phImageMem) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, - [[maybe_unused]] ur_exp_interop_mem_handle_t hInteropMem) { + [[maybe_unused]] uint64_t offset, [[maybe_unused]] uint64_t size, + [[maybe_unused]] ur_exp_external_mem_handle_t hExternalMem, + [[maybe_unused]] void **phRetMem) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( + 
[[maybe_unused]] ur_context_handle_t hContext, + [[maybe_unused]] ur_device_handle_t hDevice, + [[maybe_unused]] ur_exp_external_mem_handle_t hExternalMem) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -143,21 +152,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, [[maybe_unused]] ur_exp_external_semaphore_type_t semHandleType, - [[maybe_unused]] ur_exp_interop_semaphore_desc_t *pInteropSemaphoreDesc, - [[maybe_unused]] ur_exp_interop_semaphore_handle_t *phInteropSemaphore) { + [[maybe_unused]] ur_exp_external_semaphore_desc_t *pExternalSemaphoreDesc, + [[maybe_unused]] ur_exp_external_semaphore_handle_t *phExternalSemaphore) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, - [[maybe_unused]] ur_exp_interop_semaphore_handle_t hInteropSemaphore) { + [[maybe_unused]] ur_exp_external_semaphore_handle_t hExternalSemaphore) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( [[maybe_unused]] ur_queue_handle_t hQueue, - [[maybe_unused]] ur_exp_interop_semaphore_handle_t hSemaphore, + [[maybe_unused]] ur_exp_external_semaphore_handle_t hSemaphore, [[maybe_unused]] bool hasValue, [[maybe_unused]] uint64_t waitValue, [[maybe_unused]] uint32_t numEventsInWaitList, [[maybe_unused]] const ur_event_handle_t *phEventWaitList, @@ -167,7 +176,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( [[maybe_unused]] ur_queue_handle_t hQueue, - [[maybe_unused]] ur_exp_interop_semaphore_handle_t hSemaphore, + [[maybe_unused]] ur_exp_external_semaphore_handle_t hSemaphore, [[maybe_unused]] bool hasValue, 
[[maybe_unused]] uint64_t signalValue, [[maybe_unused]] uint32_t numEventsInWaitList, [[maybe_unused]] const ur_event_handle_t *phEventWaitList, diff --git a/source/adapters/native_cpu/kernel.cpp b/source/adapters/native_cpu/kernel.cpp index 23a65eb03b..af8906245c 100644 --- a/source/adapters/native_cpu/kernel.cpp +++ b/source/adapters/native_cpu/kernel.cpp @@ -31,14 +31,25 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, ur_kernel_handle_t_ *kernel; // Set reqd_work_group_size for kernel if needed + std::optional ReqdWG; const auto &ReqdMap = hProgram->KernelReqdWorkGroupSizeMD; - auto ReqdIt = ReqdMap.find(pKernelName); - if (ReqdIt != ReqdMap.end()) { - kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f, ReqdIt->second); - } else { - kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f); + if (auto ReqdIt = ReqdMap.find(pKernelName); ReqdIt != ReqdMap.end()) { + ReqdWG = ReqdIt->second; } + std::optional MaxWG; + const auto &MaxMap = hProgram->KernelMaxWorkGroupSizeMD; + if (auto MaxIt = MaxMap.find(pKernelName); MaxIt != MaxMap.end()) { + MaxWG = MaxIt->second; + } + std::optional MaxLinearWG; + const auto &MaxLinMap = hProgram->KernelMaxLinearWorkGroupSizeMD; + if (auto MaxLIt = MaxLinMap.find(pKernelName); MaxLIt != MaxLinMap.end()) { + MaxLinearWG = MaxLIt->second; + } + kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f, ReqdWG, MaxWG, + MaxLinearWG); + *phKernel = kernel; return UR_RESULT_SUCCESS; @@ -148,6 +159,10 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, int bytes = 0; return returnValue(static_cast(bytes)); } + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: + case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: + // FIXME: could be added + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; @@ -271,7 +286,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants( std::ignore = count; std::ignore = pSpecConstants; - 
DIE_NO_IMPLEMENTATION + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp index b5728fa8b2..084a0ee695 100644 --- a/source/adapters/native_cpu/kernel.hpp +++ b/source/adapters/native_cpu/kernel.hpp @@ -41,15 +41,14 @@ struct ur_kernel_handle_t_ : RefCounted { ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, nativecpu_task_t subhandler) - : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)}, - HasReqdWGSize(false) {} + : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)} {} ur_kernel_handle_t_(const ur_kernel_handle_t_ &other) : hProgram(other.hProgram), _name(other._name), _subhandler(other._subhandler), _args(other._args), _localArgInfo(other._localArgInfo), _localMemPool(other._localMemPool), _localMemPoolSize(other._localMemPoolSize), - HasReqdWGSize(other.HasReqdWGSize), ReqdWGSize(other.ReqdWGSize) { + ReqdWGSize(other.ReqdWGSize) { incrementReferenceCount(); } @@ -60,9 +59,12 @@ struct ur_kernel_handle_t_ : RefCounted { } ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, nativecpu_task_t subhandler, - const native_cpu::ReqdWGSize_t &ReqdWGSize) + std::optional ReqdWGSize, + std::optional MaxWGSize, + std::optional MaxLinearWGSize) : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)}, - HasReqdWGSize(true), ReqdWGSize(ReqdWGSize) {} + ReqdWGSize(ReqdWGSize), MaxWGSize(MaxWGSize), + MaxLinearWGSize(MaxLinearWGSize) {} ur_program_handle_t hProgram; std::string _name; @@ -70,9 +72,13 @@ struct ur_kernel_handle_t_ : RefCounted { std::vector _args; std::vector _localArgInfo; - bool hasReqdWGSize() const { return HasReqdWGSize; } + std::optional getReqdWGSize() const { + return ReqdWGSize; + } + + std::optional getMaxWGSize() const { return MaxWGSize; } - const native_cpu::ReqdWGSize_t &getReqdWGSize() const { return ReqdWGSize; } + 
std::optional getMaxLinearWGSize() const { return MaxLinearWGSize; } void updateMemPool(size_t numParallelThreads) { // compute requested size. @@ -103,6 +109,7 @@ struct ur_kernel_handle_t_ : RefCounted { private: char *_localMemPool = nullptr; size_t _localMemPoolSize = 0; - bool HasReqdWGSize; - native_cpu::ReqdWGSize_t ReqdWGSize; + std::optional ReqdWGSize = std::nullopt; + std::optional MaxWGSize = std::nullopt; + std::optional MaxLinearWGSize = std::nullopt; }; diff --git a/source/adapters/native_cpu/memory.cpp b/source/adapters/native_cpu/memory.cpp index 1f8a927c67..ddf93e44bc 100644 --- a/source/adapters/native_cpu/memory.cpp +++ b/source/adapters/native_cpu/memory.cpp @@ -46,7 +46,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( UR_ASSERT(size != 0, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); const bool useHostPtr = flags & UR_MEM_FLAG_USE_HOST_POINTER; - const bool copyHostPtr = flags & UR_MEM_FLAG_USE_HOST_POINTER; + const bool copyHostPtr = flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER; ur_mem_handle_t_ *retMem; diff --git a/source/adapters/native_cpu/platform.cpp b/source/adapters/native_cpu/platform.cpp index 8d650764c1..840f18f8b3 100644 --- a/source/adapters/native_cpu/platform.cpp +++ b/source/adapters/native_cpu/platform.cpp @@ -92,7 +92,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( std::ignore = pFrontendOption; std::ignore = ppPlatformOption; - CONTINUE_NO_IMPLEMENTATION; + std::ignore = hPlatform; + using namespace std::literals; + if (pFrontendOption == nullptr) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + if (pFrontendOption == "-O0"sv || pFrontendOption == "-O1"sv || + pFrontendOption == "-O2"sv || pFrontendOption == "-O3"sv || + pFrontendOption == ""sv) { + *ppPlatformOption = ""; + return UR_RESULT_SUCCESS; + } + return UR_RESULT_ERROR_INVALID_VALUE; } UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( diff --git a/source/adapters/native_cpu/program.cpp 
b/source/adapters/native_cpu/program.cpp index 77edd83bce..02ddda0b50 100644 --- a/source/adapters/native_cpu/program.cpp +++ b/source/adapters/native_cpu/program.cpp @@ -29,8 +29,9 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, DIE_NO_IMPLEMENTATION } -static ur_result_t getReqdWGSize(const ur_program_metadata_t &MetadataElement, - native_cpu::ReqdWGSize_t &res) { +static ur_result_t +deserializeWGMetadata(const ur_program_metadata_t &MetadataElement, + native_cpu::WGSize_t &res, std::uint32_t DefaultVal) { size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t); // Expect between 1 and 3 32-bit integer values. @@ -43,12 +44,12 @@ static ur_result_t getReqdWGSize(const ur_program_metadata_t &MetadataElement, const char *ValuePtr = reinterpret_cast(MetadataElement.value.pData) + sizeof(std::uint64_t); - // Read values and pad with 1's for values not present. - std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1}; - std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize); - std::get<0>(res) = ReqdWorkGroupElements[0]; - std::get<1>(res) = ReqdWorkGroupElements[1]; - std::get<2>(res) = ReqdWorkGroupElements[2]; + // Read values and pad with a default value for missing elements. 
+ std::uint32_t WorkGroupElements[] = {DefaultVal, DefaultVal, DefaultVal}; + std::memcpy(WorkGroupElements, ValuePtr, MDElemsSize); + std::get<0>(res) = WorkGroupElements[0]; + std::get<1>(res) = WorkGroupElements[1]; + std::get<2>(res) = WorkGroupElements[2]; return UR_RESULT_SUCCESS; } @@ -71,13 +72,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( const auto &mdNode = pProperties->pMetadatas[i]; std::string mdName(mdNode.pName); auto [Prefix, Tag] = splitMetadataName(mdName); - if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { - native_cpu::ReqdWGSize_t reqdWGSize; - auto res = getReqdWGSize(mdNode, reqdWGSize); + if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE || + Tag == __SYCL_UR_PROGRAM_METADATA_TAG_MAX_WORK_GROUP_SIZE) { + bool isReqd = + Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE; + native_cpu::WGSize_t wgSizeProp; + auto res = deserializeWGMetadata( + mdNode, wgSizeProp, + isReqd ? 1 : std::numeric_limits::max()); if (res != UR_RESULT_SUCCESS) { return res; } - hProgram->KernelReqdWorkGroupSizeMD[Prefix] = std::move(reqdWGSize); + (isReqd ? hProgram->KernelReqdWorkGroupSizeMD + : hProgram->KernelMaxWorkGroupSizeMD)[Prefix] = + std::move(wgSizeProp); + } else if (Tag == + __SYCL_UR_PROGRAM_METADATA_TAG_MAX_LINEAR_WORK_GROUP_SIZE) { + hProgram->KernelMaxLinearWorkGroupSizeMD[Prefix] = mdNode.value.data64; } } } @@ -112,7 +123,9 @@ urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, std::ignore = hProgram; std::ignore = pOptions; - DIE_NO_IMPLEMENTATION + // Currently for Native CPU the program is offline compiled, so + // urProgramCompile is a no-op. 
+ return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL @@ -127,21 +140,27 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count, std::ignore = phPrograms; std::ignore = pOptions; - DIE_NO_IMPLEMENTATION + // Currently for Native CPU the program is already linked and all its + // symbols are resolved, so this is a no-op. + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urProgramCompileExp(ur_program_handle_t, uint32_t, ur_device_handle_t *, const char *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + // Currently for Native CPU the program is offline compiled, so + // urProgramCompile is a no-op. + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp(ur_program_handle_t, uint32_t, ur_device_handle_t *, const char *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + // Currently for Native CPU the program is offline compiled and linked, + // so urProgramBuild is a no-op. + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urProgramLinkExp( @@ -150,7 +169,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramLinkExp( if (nullptr != phProgram) { *phProgram = nullptr; } - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + // Currently for Native CPU the program is already linked and all its + // symbols are resolved, so this is a no-op. 
+ return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL @@ -204,8 +225,6 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, return returnValue(1u); case UR_PROGRAM_INFO_DEVICES: return returnValue(hProgram->_ctx->_device); - case UR_PROGRAM_INFO_SOURCE: - return returnValue(nullptr); case UR_PROGRAM_INFO_BINARY_SIZES: return returnValue("foo"); case UR_PROGRAM_INFO_BINARIES: @@ -213,6 +232,8 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, case UR_PROGRAM_INFO_KERNEL_NAMES: { return returnValue("foo"); } + case UR_PROGRAM_INFO_IL: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } diff --git a/source/adapters/native_cpu/program.hpp b/source/adapters/native_cpu/program.hpp index e85749a7b2..d58412751e 100644 --- a/source/adapters/native_cpu/program.hpp +++ b/source/adapters/native_cpu/program.hpp @@ -18,7 +18,7 @@ #include namespace native_cpu { -using ReqdWGSize_t = std::array; +using WGSize_t = std::array; } struct ur_program_handle_t_ : RefCounted { @@ -36,8 +36,11 @@ struct ur_program_handle_t_ : RefCounted { }; std::map _kernels; - std::unordered_map + std::unordered_map KernelReqdWorkGroupSizeMD; + std::unordered_map + KernelMaxWorkGroupSizeMD; + std::unordered_map KernelMaxLinearWorkGroupSizeMD; }; // The nativecpu_entry struct is also defined as LLVM-IR in the diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp index 55c32eb84b..2f2f79cd5a 100644 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -81,8 +81,11 @@ class worker_thread { // Waits for all tasks to finish and destroys the worker thread inline void stop() { - m_isRunning.store(false, std::memory_order_release); - m_startWorkCondition.notify_all(); + { + std::lock_guard lock(m_workMutex); + m_isRunning.store(false, std::memory_order_release); + m_startWorkCondition.notify_all(); + } if (m_worker.joinable()) { // Wait for the 
worker thread to finish handling the task queue m_worker.join(); diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index 32bc5e9c46..94c6c4a03e 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -290,6 +290,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( pDdiTable->pfnCommandGetInfoExp = urCommandBufferCommandGetInfoExp; pDdiTable->pfnReleaseCommandExp = urCommandBufferReleaseCommandExp; pDdiTable->pfnRetainCommandExp = urCommandBufferRetainCommandExp; + pDdiTable->pfnUpdateWaitEventsExp = urCommandBufferUpdateWaitEventsExp; + pDdiTable->pfnUpdateSignalEventExp = urCommandBufferUpdateSignalEventExp; return retVal; } @@ -329,7 +331,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnImportExternalMemoryExp = urBindlessImagesImportExternalMemoryExp; pDdiTable->pfnMapExternalArrayExp = urBindlessImagesMapExternalArrayExp; - pDdiTable->pfnReleaseInteropExp = urBindlessImagesReleaseInteropExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + urBindlessImagesMapExternalLinearMemoryExp; + pDdiTable->pfnReleaseExternalMemoryExp = + urBindlessImagesReleaseExternalMemoryExp; pDdiTable->pfnImportExternalSemaphoreExp = urBindlessImagesImportExternalSemaphoreExp; pDdiTable->pfnReleaseExternalSemaphoreExp = diff --git a/source/adapters/native_cpu/usm.cpp b/source/adapters/native_cpu/usm.cpp index 45ac0596f3..2fe0d551a8 100644 --- a/source/adapters/native_cpu/usm.cpp +++ b/source/adapters/native_cpu/usm.cpp @@ -8,90 +8,97 @@ // //===----------------------------------------------------------------------===// +#include "ur/ur.hpp" #include "ur_api.h" #include "common.hpp" +#include "context.hpp" +#include -UR_APIEXPORT ur_result_t UR_APICALL -urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, - ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - 
std::ignore = hContext; - std::ignore = pUSMDesc; - std::ignore = pool; +namespace umf { +ur_result_t getProviderNativeError(const char *, int32_t) { + return UR_RESULT_ERROR_UNKNOWN; +} +} // namespace umf +static ur_result_t alloc_helper(ur_context_handle_t hContext, + const ur_usm_desc_t *pUSMDesc, size_t size, + void **ppMem, ur_usm_type_t type) { + auto alignment = (pUSMDesc && pUSMDesc->align) ? pUSMDesc->align : 1u; + UR_ASSERT(isPowerOf2(alignment), UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT); UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); // TODO: Check Max size when UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE is implemented UR_ASSERT(size > 0, UR_RESULT_ERROR_INVALID_USM_SIZE); - *ppMem = malloc(size); + auto *ptr = hContext->add_alloc(alignment, type, size, nullptr); + UR_ASSERT(ptr != nullptr, UR_RESULT_ERROR_OUT_OF_RESOURCES); + *ppMem = ptr; return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL +urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t pool, size_t size, void **ppMem) { + std::ignore = pool; + + return alloc_helper(hContext, pUSMDesc, size, ppMem, UR_USM_TYPE_HOST); +} + UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - std::ignore = hContext; std::ignore = hDevice; - std::ignore = pUSMDesc; std::ignore = pool; - UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - // TODO: Check Max size when UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE is implemented - UR_ASSERT(size > 0, UR_RESULT_ERROR_INVALID_USM_SIZE); - - *ppMem = malloc(size); - - return UR_RESULT_SUCCESS; + return alloc_helper(hContext, pUSMDesc, size, ppMem, UR_USM_TYPE_DEVICE); } UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, size_t size, void **ppMem) { - std::ignore 
= hContext; std::ignore = hDevice; - std::ignore = pUSMDesc; std::ignore = pool; - UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - // TODO: Check Max size when UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE is implemented - UR_ASSERT(size > 0, UR_RESULT_ERROR_INVALID_USM_SIZE); - - *ppMem = malloc(size); - - return UR_RESULT_SUCCESS; + return alloc_helper(hContext, pUSMDesc, size, ppMem, UR_USM_TYPE_SHARED); } UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, void *pMem) { - std::ignore = hContext; UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); - free(pMem); + auto res = hContext->remove_alloc(pMem); - return UR_RESULT_SUCCESS; + return res; } UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, ur_usm_alloc_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { - std::ignore = hContext; - std::ignore = pMem; - std::ignore = propName; - std::ignore = propSize; - std::ignore = pPropValue; - std::ignore = pPropSizeRet; + UR_ASSERT(pMem != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + if (propName == UR_USM_ALLOC_INFO_BASE_PTR) { + // TODO: logic to compute base ptr given ptr + DIE_NO_IMPLEMENTATION; + } + const native_cpu::usm_alloc_info &alloc_info = + hContext->get_alloc_info_entry(pMem); switch (propName) { case UR_USM_ALLOC_INFO_TYPE: - // Todo implement this in context - return ReturnValue(UR_USM_TYPE_DEVICE); + return ReturnValue(alloc_info.type); + case UR_USM_ALLOC_INFO_SIZE: + return ReturnValue(alloc_info.size); + case UR_USM_ALLOC_INFO_DEVICE: + return ReturnValue(alloc_info.device); + case UR_USM_ALLOC_INFO_POOL: + return ReturnValue(alloc_info.pool); default: DIE_NO_IMPLEMENTATION; } diff --git a/source/adapters/opencl/CMakeLists.txt b/source/adapters/opencl/CMakeLists.txt index 4600163963..c342e28c90 100644 --- 
a/source/adapters/opencl/CMakeLists.txt +++ b/source/adapters/opencl/CMakeLists.txt @@ -44,6 +44,7 @@ add_ur_adapter(${TARGET_NAME} SHARED ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.hpp ) +install_ur_library(${TARGET_NAME}) set_target_properties(${TARGET_NAME} PROPERTIES VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" @@ -55,7 +56,7 @@ if(UR_OPENCL_INCLUDE_DIR) else() FetchContent_Declare(OpenCL-Headers GIT_REPOSITORY "https://github.com/KhronosGroup/OpenCL-Headers.git" - GIT_TAG main + GIT_TAG 542d7a8f65ecfd88b38de35d8b10aa67b36b33b2 ) FetchContent_MakeAvailable(OpenCL-Headers) FetchContent_GetProperties(OpenCL-Headers @@ -97,6 +98,7 @@ target_include_directories(${TARGET_NAME} PRIVATE target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common + ${PROJECT_NAME}::umf Threads::Threads ${OpenCLICDLoaderLibrary} ) diff --git a/source/adapters/opencl/command_buffer.cpp b/source/adapters/opencl/command_buffer.cpp index 5698f36928..15029d5e27 100644 --- a/source/adapters/opencl/command_buffer.cpp +++ b/source/adapters/opencl/command_buffer.cpp @@ -71,10 +71,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( const bool IsUpdatable = pCommandBufferDesc ? 
pCommandBufferDesc->isUpdatable : false; - bool DeviceSupportsUpdate = false; + ur_device_command_buffer_update_capability_flags_t UpdateCapabilities; cl_device_id CLDevice = cl_adapter::cast(hDevice); - CL_RETURN_ON_FAILURE(deviceSupportsURCommandBufferKernelUpdate( - CLDevice, DeviceSupportsUpdate)); + CL_RETURN_ON_FAILURE( + getDeviceCommandBufferUpdateCapabilities(CLDevice, UpdateCapabilities)); + bool DeviceSupportsUpdate = UpdateCapabilities > 0; if (IsUpdatable && !DeviceSupportsUpdate) { return UR_RESULT_ERROR_INVALID_OPERATION; @@ -140,10 +141,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t /*numKernelAlternatives*/, + ur_kernel_handle_t * /*phKernelAlternatives*/, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, ur_exp_command_buffer_command_handle_t *phCommandHandle) { + (void)numEventsInWaitList; + (void)phEventWaitList; + (void)phEvent; + + // Command handles can only be obtained from updatable command-buffers + UR_ASSERT(!(phCommandHandle && !hCommandBuffer->IsUpdatable), + UR_RESULT_ERROR_INVALID_OPERATION); cl_context CLContext = cl_adapter::cast(hCommandBuffer->hContext); cl_ext::clCommandNDRangeKernelKHR_fn clCommandNDRangeKernelKHR = nullptr; @@ -156,7 +167,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( cl_mutable_command_khr *OutCommandHandle = hCommandBuffer->IsUpdatable ? 
&CommandHandle : nullptr; - cl_ndrange_kernel_command_properties_khr UpdateProperties[] = { + cl_command_properties_khr UpdateProperties[] = { CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR, CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR | CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR | @@ -164,7 +175,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( CL_MUTABLE_DISPATCH_ARGUMENTS_KHR | CL_MUTABLE_DISPATCH_EXEC_INFO_KHR, 0}; - cl_ndrange_kernel_command_properties_khr *Properties = + cl_command_properties_khr *Properties = hCommandBuffer->IsUpdatable ? UpdateProperties : nullptr; CL_RETURN_ON_FAILURE(clCommandNDRangeKernelKHR( hCommandBuffer->CLCommandBuffer, nullptr, Properties, @@ -175,9 +186,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( try { auto URCommandHandle = std::make_unique( - hCommandBuffer, CommandHandle, workDim, pLocalWorkSize != nullptr); - *phCommandHandle = URCommandHandle.release(); - hCommandBuffer->CommandHandles.push_back(*phCommandHandle); + hCommandBuffer, CommandHandle, hKernel, workDim, + pLocalWorkSize != nullptr); + ur_exp_command_buffer_command_handle_t Handle = URCommandHandle.release(); + hCommandBuffer->CommandHandles.push_back(Handle); + if (phCommandHandle) { + *phCommandHandle = Handle; + } } catch (...) 
{ return UR_RESULT_ERROR_OUT_OF_RESOURCES; } @@ -192,7 +207,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( [[maybe_unused]] uint32_t numSyncPointsInWaitList, [[maybe_unused]] const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint) { + [[maybe_unused]] uint32_t numEventsInWaitList, + [[maybe_unused]] const ur_event_handle_t *phEventWaitList, + [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint, + [[maybe_unused]] ur_event_handle_t *phEvent, + [[maybe_unused]] ur_exp_command_buffer_command_handle_t *phCommand) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -203,7 +222,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( [[maybe_unused]] uint32_t numSyncPointsInWaitList, [[maybe_unused]] const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint) { + [[maybe_unused]] uint32_t numEventsInWaitList, + [[maybe_unused]] const ur_event_handle_t *phEventWaitList, + [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint, + [[maybe_unused]] ur_event_handle_t *phEvent, + [[maybe_unused]] ur_exp_command_buffer_command_handle_t *phCommand) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -212,8 +235,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( ur_mem_handle_t hDstMem, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { + (void)numEventsInWaitList; + (void)phEventWaitList; + (void)phEvent; + (void)phCommand; cl_context CLContext = cl_adapter::cast(hCommandBuffer->hContext); 
cl_ext::clCommandCopyBufferKHR_fn clCommandCopyBufferKHR = nullptr; UR_RETURN_ON_FAILURE( @@ -222,7 +250,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( cl_ext::CommandCopyBufferName, &clCommandCopyBufferKHR)); CL_RETURN_ON_FAILURE(clCommandCopyBufferKHR( - hCommandBuffer->CLCommandBuffer, nullptr, + hCommandBuffer->CLCommandBuffer, nullptr, nullptr, cl_adapter::cast(hSrcMem), cl_adapter::cast(hDstMem), srcOffset, dstOffset, size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, nullptr)); @@ -242,7 +270,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( [[maybe_unused]] uint32_t numSyncPointsInWaitList, [[maybe_unused]] const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint) { + [[maybe_unused]] uint32_t numEventsInWaitList, + [[maybe_unused]] const ur_event_handle_t *phEventWaitList, + [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint, + [[maybe_unused]] ur_event_handle_t *phEvent, + [[maybe_unused]] ur_exp_command_buffer_command_handle_t *phCommand) { size_t OpenCLOriginRect[3]{srcOrigin.x, srcOrigin.y, srcOrigin.z}; size_t OpenCLDstRect[3]{dstOrigin.x, dstOrigin.y, dstOrigin.z}; @@ -256,7 +288,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( cl_ext::CommandCopyBufferRectName, &clCommandCopyBufferRectKHR)); CL_RETURN_ON_FAILURE(clCommandCopyBufferRectKHR( - hCommandBuffer->CLCommandBuffer, nullptr, + hCommandBuffer->CLCommandBuffer, nullptr, nullptr, cl_adapter::cast(hSrcMem), cl_adapter::cast(hDstMem), OpenCLOriginRect, OpenCLDstRect, OpenCLRegion, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, numSyncPointsInWaitList, pSyncPointWaitList, @@ -273,10 +305,11 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( [[maybe_unused]] uint32_t numSyncPointsInWaitList, [[maybe_unused]] const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - 
[[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint) { - - cl_adapter::die("Experimental Command-buffer feature is not " - "implemented for OpenCL adapter."); + [[maybe_unused]] uint32_t numEventsInWaitList, + [[maybe_unused]] const ur_event_handle_t *phEventWaitList, + [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint, + [[maybe_unused]] ur_event_handle_t *phEvent, + [[maybe_unused]] ur_exp_command_buffer_command_handle_t *phCommand) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -288,7 +321,11 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( [[maybe_unused]] uint32_t numSyncPointsInWaitList, [[maybe_unused]] const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint) { + [[maybe_unused]] uint32_t numEventsInWaitList, + [[maybe_unused]] const ur_event_handle_t *phEventWaitList, + [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint, + [[maybe_unused]] ur_event_handle_t *phEvent, + [[maybe_unused]] ur_exp_command_buffer_command_handle_t *phCommand) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -306,7 +343,11 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( [[maybe_unused]] uint32_t numSyncPointsInWaitList, [[maybe_unused]] const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint) { + [[maybe_unused]] uint32_t numEventsInWaitList, + [[maybe_unused]] const ur_event_handle_t *phEventWaitList, + [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint, + [[maybe_unused]] ur_event_handle_t *phEvent, + [[maybe_unused]] ur_exp_command_buffer_command_handle_t *phCommand) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -324,7 +365,11 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( [[maybe_unused]] uint32_t numSyncPointsInWaitList, [[maybe_unused]] const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - 
[[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint) { + [[maybe_unused]] uint32_t numEventsInWaitList, + [[maybe_unused]] const ur_event_handle_t *phEventWaitList, + [[maybe_unused]] ur_exp_command_buffer_sync_point_t *pSyncPoint, + [[maybe_unused]] ur_event_handle_t *phEvent, + [[maybe_unused]] ur_exp_command_buffer_command_handle_t *phCommand) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -333,7 +378,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( const void *pPattern, size_t patternSize, size_t offset, size_t size, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + [[maybe_unused]] uint32_t numEventsInWaitList, + [[maybe_unused]] const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, + [[maybe_unused]] ur_event_handle_t *phEvent, + [[maybe_unused]] ur_exp_command_buffer_command_handle_t *phCommand) { cl_context CLContext = cl_adapter::cast(hCommandBuffer->hContext); cl_ext::clCommandFillBufferKHR_fn clCommandFillBufferKHR = nullptr; @@ -343,7 +392,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( cl_ext::CommandFillBufferName, &clCommandFillBufferKHR)); CL_RETURN_ON_FAILURE(clCommandFillBufferKHR( - hCommandBuffer->CLCommandBuffer, nullptr, + hCommandBuffer->CLCommandBuffer, nullptr, nullptr, cl_adapter::cast(hBuffer), pPattern, patternSize, offset, size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, nullptr)); @@ -354,14 +403,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( ur_exp_command_buffer_handle_t hCommandBuffer, const void *mem, size_t size, ur_usm_migration_flags_t flags, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + 
ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { (void)hCommandBuffer; (void)mem; (void)size; (void)flags; (void)numSyncPointsInWaitList; (void)pSyncPointWaitList; + (void)numEventsInWaitList; + (void)phEventWaitList; (void)pSyncPoint; + (void)phEvent; + (void)phCommand; // Not implemented return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -371,14 +426,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( ur_exp_command_buffer_handle_t hCommandBuffer, const void *mem, size_t size, ur_usm_advice_flags_t advice, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) { (void)hCommandBuffer; (void)mem; (void)size; (void)advice; (void)numSyncPointsInWaitList; (void)pSyncPointWaitList; + (void)numEventsInWaitList; + (void)phEventWaitList; (void)pSyncPoint; + (void)phEvent; + (void)phCommand; // Not implemented return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -485,6 +546,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( const ur_exp_command_buffer_update_kernel_launch_desc_t *pUpdateKernelLaunch) { + // Kernel handle updates are not yet supported. 
+ if (pUpdateKernelLaunch->hNewKernel != hCommand->Kernel) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + ur_exp_command_buffer_handle_t hCommandBuffer = hCommand->hCommandBuffer; cl_context CLContext = cl_adapter::cast(hCommandBuffer->hContext); @@ -497,27 +563,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( if (!hCommandBuffer->IsFinalized || !hCommandBuffer->IsUpdatable) return UR_RESULT_ERROR_INVALID_OPERATION; - if (cl_uint NewWorkDim = pUpdateKernelLaunch->newWorkDim) { - // Error if work dim changes - if (NewWorkDim != hCommand->WorkDim) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } - - // Error If Local size and not global size - if ((pUpdateKernelLaunch->pNewLocalWorkSize != nullptr) && - (pUpdateKernelLaunch->pNewGlobalWorkSize == nullptr)) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } - - // Error if local size non-nullptr and created with null - // or if local size nullptr and created with non-null - const bool IsNewLocalSizeNull = - pUpdateKernelLaunch->pNewLocalWorkSize == nullptr; - const bool IsOriginalLocalSizeNull = !hCommand->UserDefinedLocalSize; - - if (IsNewLocalSizeNull ^ IsOriginalLocalSizeNull) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } + if (pUpdateKernelLaunch->newWorkDim != hCommand->WorkDim && + (!pUpdateKernelLaunch->pNewGlobalWorkOffset || + !pUpdateKernelLaunch->pNewGlobalWorkSize)) { + return UR_RESULT_ERROR_INVALID_OPERATION; } // Find the CL USM pointer arguments to the kernel to update @@ -556,8 +605,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( cl_mutable_command_khr command = cl_adapter::cast(hCommand->CLMutableCommand); cl_mutable_dispatch_config_khr dispatch_config = { - CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR, - nullptr, command, static_cast(CLArgs.size()), // num_args static_cast(CLUSMArgs.size()), // num_svm_args @@ -570,14 +617,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( CLGlobalWorkSize.data(), 
// global_work_size CLLocalWorkSize.data(), // local_work_size }; - cl_mutable_base_config_khr config = { - CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR, nullptr, 1, &dispatch_config}; - CL_RETURN_ON_FAILURE( - clUpdateMutableCommandsKHR(hCommandBuffer->CLCommandBuffer, &config)); + cl_uint num_configs = 1; + cl_command_buffer_update_type_khr config_types[1] = { + CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR}; + const void *configs[1] = {&dispatch_config}; + CL_RETURN_ON_FAILURE(clUpdateMutableCommandsKHR( + hCommandBuffer->CLCommandBuffer, num_configs, config_types, configs)); return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateSignalEventExp( + [[maybe_unused]] ur_exp_command_buffer_command_handle_t Command, + [[maybe_unused]] ur_event_handle_t *Event) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateWaitEventsExp( + [[maybe_unused]] ur_exp_command_buffer_command_handle_t Command, + [[maybe_unused]] uint32_t NumEventsInWaitList, + [[maybe_unused]] const ur_event_handle_t *EventWaitList) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_exp_command_buffer_info_t propName, size_t propSize, void *pPropValue, @@ -588,6 +650,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetInfoExp( switch (propName) { case UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT: return ReturnValue(hCommandBuffer->getExternalReferenceCount()); + case UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR: { + ur_exp_command_buffer_desc_t Descriptor{}; + Descriptor.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; + Descriptor.pNext = nullptr; + Descriptor.isUpdatable = hCommandBuffer->IsUpdatable; + Descriptor.isInOrder = false; + Descriptor.enableProfiling = false; + + return ReturnValue(Descriptor); + } default: assert(!"Command-buffer info request not implemented"); } diff --git 
a/source/adapters/opencl/command_buffer.hpp b/source/adapters/opencl/command_buffer.hpp index 4c39b1ad74..d8e975a3df 100644 --- a/source/adapters/opencl/command_buffer.hpp +++ b/source/adapters/opencl/command_buffer.hpp @@ -17,6 +17,8 @@ struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_handle_t hCommandBuffer; /// OpenCL command-handle. cl_mutable_command_khr CLMutableCommand; + /// Kernel associated with this command handle + ur_kernel_handle_t Kernel; /// Work-dimension the command was originally created with. cl_uint WorkDim; /// Set to true if the user set the local work size on command creation. @@ -31,11 +33,12 @@ struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_command_handle_t_( ur_exp_command_buffer_handle_t hCommandBuffer, - cl_mutable_command_khr CLMutableCommand, cl_uint WorkDim, - bool UserDefinedLocalSize) + cl_mutable_command_khr CLMutableCommand, ur_kernel_handle_t Kernel, + cl_uint WorkDim, bool UserDefinedLocalSize) : hCommandBuffer(hCommandBuffer), CLMutableCommand(CLMutableCommand), - WorkDim(WorkDim), UserDefinedLocalSize(UserDefinedLocalSize), - RefCountInternal(0), RefCountExternal(0) {} + Kernel(Kernel), WorkDim(WorkDim), + UserDefinedLocalSize(UserDefinedLocalSize), RefCountInternal(0), + RefCountExternal(0) {} uint32_t incrementInternalReferenceCount() noexcept { return ++RefCountInternal; diff --git a/source/adapters/opencl/common.cpp b/source/adapters/opencl/common.cpp index 03775fb87d..d6e934c68b 100644 --- a/source/adapters/opencl/common.cpp +++ b/source/adapters/opencl/common.cpp @@ -12,7 +12,7 @@ #include "logger/ur_logger.hpp" namespace cl_adapter { -/* Global variables for urPlatformGetLastError() */ +/* Global variables for urAdapterGetLastError() */ thread_local int32_t ErrorMessageCode = 0; thread_local char ErrorMessage[MaxMessageSize]; @@ -95,6 +95,8 @@ ur_result_t mapCLErrorToUR(cl_int Result) { return UR_RESULT_ERROR_INVALID_QUEUE; case CL_INVALID_ARG_SIZE: return 
UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE; + case CL_INVALID_SPEC_ID: + return UR_RESULT_ERROR_INVALID_SPEC_ID; default: return UR_RESULT_ERROR_UNKNOWN; } @@ -116,8 +118,12 @@ ur_result_t getNativeHandle(void *URObj, ur_native_handle_t *NativeHandle) { return UR_RESULT_SUCCESS; } -cl_int deviceSupportsURCommandBufferKernelUpdate(cl_device_id Dev, - bool &Result) { +cl_int getDeviceCommandBufferUpdateCapabilities( + cl_device_id Dev, + ur_device_command_buffer_update_capability_flags_t &UpdateCapabilities) { + + UpdateCapabilities = 0; + size_t ExtSize = 0; CL_RETURN_ON_FAILURE( clGetDeviceInfo(Dev, CL_DEVICE_EXTENSIONS, 0, nullptr, &ExtSize)); @@ -129,21 +135,34 @@ cl_int deviceSupportsURCommandBufferKernelUpdate(cl_device_id Dev, std::string SupportedExtensions(ExtStr.c_str()); if (ExtStr.find("cl_khr_command_buffer_mutable_dispatch") == std::string::npos) { - Result = false; return CL_SUCCESS; } - // All the CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR capabilities must - // be supported by a device for UR update. 
- cl_mutable_dispatch_fields_khr mutable_capabilities; + cl_mutable_dispatch_fields_khr MutableCapabilities; CL_RETURN_ON_FAILURE(clGetDeviceInfo( Dev, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR, - sizeof(mutable_capabilities), &mutable_capabilities, nullptr)); - const cl_mutable_dispatch_fields_khr required_caps = - CL_MUTABLE_DISPATCH_ARGUMENTS_KHR | - CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR | - CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR | CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR | - CL_MUTABLE_DISPATCH_EXEC_INFO_KHR; - Result = (mutable_capabilities & required_caps) == required_caps; + sizeof(MutableCapabilities), &MutableCapabilities, nullptr)); + + if (!(MutableCapabilities & CL_MUTABLE_DISPATCH_EXEC_INFO_KHR)) { + return CL_SUCCESS; + } + + if (MutableCapabilities & CL_MUTABLE_DISPATCH_ARGUMENTS_KHR) { + UpdateCapabilities |= + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS; + } + if (MutableCapabilities & CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR) { + UpdateCapabilities |= + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE; + } + if (MutableCapabilities & CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR) { + UpdateCapabilities |= + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE; + } + if (MutableCapabilities & CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR) { + UpdateCapabilities |= + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET; + } + return CL_SUCCESS; } diff --git a/source/adapters/opencl/common.hpp b/source/adapters/opencl/common.hpp index 399f668077..de8b766272 100644 --- a/source/adapters/opencl/common.hpp +++ b/source/adapters/opencl/common.hpp @@ -272,8 +272,8 @@ cl_int(CL_API_CALL *)(cl_command_buffer_khr command_buffer); using clCommandNDRangeKernelKHR_fn = CL_API_ENTRY cl_int(CL_API_CALL *)( cl_command_buffer_khr command_buffer, cl_command_queue command_queue, - const cl_ndrange_kernel_command_properties_khr *properties, - cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, + const 
cl_command_properties_khr *properties, cl_kernel kernel, + cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_sync_points_in_wait_list, const cl_sync_point_khr *sync_point_wait_list, @@ -281,24 +281,27 @@ using clCommandNDRangeKernelKHR_fn = CL_API_ENTRY cl_int(CL_API_CALL *)( using clCommandCopyBufferKHR_fn = CL_API_ENTRY cl_int(CL_API_CALL *)( cl_command_buffer_khr command_buffer, cl_command_queue command_queue, - cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, - size_t size, cl_uint num_sync_points_in_wait_list, + const cl_command_properties_khr *properties, cl_mem src_buffer, + cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, + cl_uint num_sync_points_in_wait_list, const cl_sync_point_khr *sync_point_wait_list, cl_sync_point_khr *sync_point, cl_mutable_command_khr *mutable_handle); using clCommandCopyBufferRectKHR_fn = CL_API_ENTRY cl_int(CL_API_CALL *)( cl_command_buffer_khr command_buffer, cl_command_queue command_queue, - cl_mem src_buffer, cl_mem dst_buffer, const size_t *src_origin, - const size_t *dst_origin, const size_t *region, size_t src_row_pitch, - size_t src_slice_pitch, size_t dst_row_pitch, size_t dst_slice_pitch, + const cl_command_properties_khr *properties, cl_mem src_buffer, + cl_mem dst_buffer, const size_t *src_origin, const size_t *dst_origin, + const size_t *region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch, cl_uint num_sync_points_in_wait_list, const cl_sync_point_khr *sync_point_wait_list, cl_sync_point_khr *sync_point, cl_mutable_command_khr *mutable_handle); using clCommandFillBufferKHR_fn = CL_API_ENTRY cl_int(CL_API_CALL *)( cl_command_buffer_khr command_buffer, cl_command_queue command_queue, - cl_mem buffer, const void *pattern, size_t pattern_size, size_t offset, - size_t size, cl_uint num_sync_points_in_wait_list, + const cl_command_properties_khr 
*properties, cl_mem buffer, + const void *pattern, size_t pattern_size, size_t offset, size_t size, + cl_uint num_sync_points_in_wait_list, const cl_sync_point_khr *sync_point_wait_list, cl_sync_point_khr *sync_point, cl_mutable_command_khr *mutable_handle); @@ -313,51 +316,40 @@ using clGetCommandBufferInfoKHR_fn = CL_API_ENTRY cl_int(CL_API_CALL *)( size_t param_value_size, void *param_value, size_t *param_value_size_ret); using clUpdateMutableCommandsKHR_fn = CL_API_ENTRY -cl_int(CL_API_CALL *)(cl_command_buffer_khr command_buffer, - const cl_mutable_base_config_khr *mutable_config); +cl_int(CL_API_CALL *)(cl_command_buffer_khr command_buffer, cl_uint num_configs, + const cl_command_buffer_update_type_khr *config_types, + const void **configs); template struct FuncPtrCache { std::map Map; std::mutex Mutex; + + void clear(cl_context context) { + std::lock_guard CacheLock{Mutex}; + Map.erase(context); + } }; -// FIXME: There's currently no mechanism for cleaning up this cache, meaning -// that it is invalidated whenever a context is destroyed. This could lead to -// reusing an invalid function pointer if another context happens to have the -// same native handle. 
struct ExtFuncPtrCacheT { - FuncPtrCache clHostMemAllocINTELCache; - FuncPtrCache clDeviceMemAllocINTELCache; - FuncPtrCache clSharedMemAllocINTELCache; - FuncPtrCache clGetDeviceFunctionPointerCache; - FuncPtrCache - clGetDeviceGlobalVariablePointerCache; - FuncPtrCache - clCreateBufferWithPropertiesINTELCache; - FuncPtrCache clMemBlockingFreeINTELCache; - FuncPtrCache - clSetKernelArgMemPointerINTELCache; - FuncPtrCache clEnqueueMemFillINTELCache; - FuncPtrCache clEnqueueMemcpyINTELCache; - FuncPtrCache clGetMemAllocInfoINTELCache; - FuncPtrCache - clEnqueueWriteGlobalVariableCache; - FuncPtrCache clEnqueueReadGlobalVariableCache; - FuncPtrCache clEnqueueReadHostPipeINTELCache; - FuncPtrCache clEnqueueWriteHostPipeINTELCache; - FuncPtrCache - clSetProgramSpecializationConstantCache; - FuncPtrCache clCreateCommandBufferKHRCache; - FuncPtrCache clRetainCommandBufferKHRCache; - FuncPtrCache clReleaseCommandBufferKHRCache; - FuncPtrCache clFinalizeCommandBufferKHRCache; - FuncPtrCache clCommandNDRangeKernelKHRCache; - FuncPtrCache clCommandCopyBufferKHRCache; - FuncPtrCache clCommandCopyBufferRectKHRCache; - FuncPtrCache clCommandFillBufferKHRCache; - FuncPtrCache clEnqueueCommandBufferKHRCache; - FuncPtrCache clGetCommandBufferInfoKHRCache; - FuncPtrCache clUpdateMutableCommandsKHRCache; +#define CL_EXTENSION_FUNC(func) FuncPtrCache func##Cache; + +#include "extension_functions.def" + +#undef CL_EXTENSION_FUNC + + // If a context stored in the current caching mechanism is destroyed by the + // CL driver all of its function pointers are invalidated. This can lead to a + // pathological case where a subsequently created context gets returned with + // a coincidentally identical handle to the destroyed one and ends up being + // used to retrieve bad function pointers. To avoid this we clear the cache + // when contexts are released. 
+ void clearCache(cl_context context) { +#define CL_EXTENSION_FUNC(func) func##Cache.clear(context); + +#include "extension_functions.def" + +#undef CL_EXTENSION_FUNC + } }; // A raw pointer is used here since the lifetime of this map has to be tied to // piTeardown to avoid issues with static destruction order (a user application @@ -429,5 +421,6 @@ ur_result_t mapCLErrorToUR(cl_int Result); ur_result_t getNativeHandle(void *URObj, ur_native_handle_t *NativeHandle); -cl_int deviceSupportsURCommandBufferKernelUpdate(cl_device_id Dev, - bool &Result); +cl_int getDeviceCommandBufferUpdateCapabilities( + cl_device_id Dev, + ur_device_command_buffer_update_capability_flags_t &UpdateCapabilities); diff --git a/source/adapters/opencl/context.cpp b/source/adapters/opencl/context.cpp index 1478050cda..38202bbf58 100644 --- a/source/adapters/opencl/context.cpp +++ b/source/adapters/opencl/context.cpp @@ -113,9 +113,30 @@ urContextGetInfo(ur_context_handle_t hContext, ur_context_info_t propName, UR_APIEXPORT ur_result_t UR_APICALL urContextRelease(ur_context_handle_t hContext) { + // If we're reasonably sure this context is about to be destroyed we should + // clear the ext function pointer cache. This isn't foolproof sadly but it + // should drastically reduce the chances of the pathological case described + // in the comments in common.hpp. + static std::mutex contextReleaseMutex; + auto clContext = cl_adapter::cast(hContext); - cl_int Ret = clReleaseContext(cl_adapter::cast(hContext)); - return mapCLErrorToUR(Ret); + { + std::lock_guard lock(contextReleaseMutex); + size_t refCount = 0; + CL_RETURN_ON_FAILURE(clGetContextInfo(clContext, CL_CONTEXT_REFERENCE_COUNT, + sizeof(size_t), &refCount, nullptr)); + + // ExtFuncPtrCache is destroyed in an atexit() callback, so it doesn't + // necessarily outlive the adapter (or all the contexts). 
+ if (refCount == 1 && cl_ext::ExtFuncPtrCache) { + cl_ext::ExtFuncPtrCache->clearCache(clContext); + } + } + + CL_RETURN_ON_FAILURE( + clReleaseContext(cl_adapter::cast(hContext))); + + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp index 44262df26a..b9cd7e38fc 100644 --- a/source/adapters/opencl/device.cpp +++ b/source/adapters/opencl/device.cpp @@ -32,6 +32,17 @@ ur_result_t cl_adapter::getDeviceVersion(cl_device_id Dev, return UR_RESULT_SUCCESS; } +static bool isIntelFPGAEmuDevice(cl_device_id Dev) { + size_t NameSize = 0; + CL_RETURN_ON_FAILURE( + clGetDeviceInfo(Dev, CL_DEVICE_NAME, 0, nullptr, &NameSize)); + std::string NameStr(NameSize, '\0'); + CL_RETURN_ON_FAILURE( + clGetDeviceInfo(Dev, CL_DEVICE_NAME, NameSize, NameStr.data(), nullptr)); + + return NameStr.find("Intel(R) FPGA Emulation Device") != std::string::npos; +} + ur_result_t cl_adapter::checkDeviceExtensions( cl_device_id Dev, const std::vector &Exts, bool &Supported) { size_t ExtSize = 0; @@ -46,6 +57,14 @@ ur_result_t cl_adapter::checkDeviceExtensions( Supported = true; for (const std::string &Ext : Exts) { if (!(Supported = (ExtStr.find(Ext) != std::string::npos))) { + // The Intel FPGA emulation device does actually support these, even if it + // doesn't report them. 
+ if (isIntelFPGAEmuDevice(Dev) && + (Ext == "cl_intel_device_attribute_query" || + Ext == "cl_intel_required_subgroup_size")) { + Supported = true; + continue; + } break; } } @@ -431,15 +450,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, URValue[i].type = static_cast(CLValue[0]); switch (URValue[i].type) { case UR_DEVICE_PARTITION_EQUALLY: { - URValue[i].value.equally = CLValue[i + 1]; + URValue[i].value.equally = static_cast(CLValue[i + 1]); break; } case UR_DEVICE_PARTITION_BY_COUNTS: { - URValue[i].value.count = CLValue[i + 1]; + URValue[i].value.count = static_cast(CLValue[i + 1]); break; } case UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN: { - URValue[i].value.affinity_domain = CLValue[i + 1]; + URValue[i].value.affinity_domain = + static_cast(CLValue[i + 1]); break; } default: { @@ -825,12 +845,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: case UR_DEVICE_INFO_LOCAL_MEM_TYPE: case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: - case UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: - case UR_DEVICE_INFO_USM_HOST_SUPPORT: - case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: - case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: - case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: - case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { + case UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: { /* CL type: cl_bitfield / enum * UR type: ur_flags_t (uint32_t) */ @@ -844,6 +859,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, * types are uint32_t */ return ReturnValue(static_cast(CLValue)); } + case UR_DEVICE_INFO_USM_HOST_SUPPORT: + case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: + case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: + case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: + case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { + /* CL type: cl_bitfield / enum + * UR type: ur_flags_t (uint32_t) */ + bool Supported = false; + 
UR_RETURN_ON_FAILURE(cl_adapter::checkDeviceExtensions( + cl_adapter::cast(hDevice), + {"cl_intel_unified_shared_memory"}, Supported)); + if (Supported) { + cl_bitfield CLValue = 0; + CL_RETURN_ON_FAILURE( + clGetDeviceInfo(cl_adapter::cast(hDevice), CLPropName, + sizeof(cl_bitfield), &CLValue, nullptr)); + return ReturnValue(static_cast(CLValue)); + } else { + return ReturnValue(0); + } + } case UR_DEVICE_INFO_IMAGE_SUPPORTED: case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: @@ -918,8 +954,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_VERSION: case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: case UR_DEVICE_INFO_BUILT_IN_KERNELS: - case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: - case UR_DEVICE_INFO_IP_VERSION: { + case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { /* We can just use the OpenCL outputs because the sizes of OpenCL types * are the same as UR. * | CL | UR | Size | @@ -937,7 +972,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return UR_RESULT_SUCCESS; } + case UR_DEVICE_INFO_IP_VERSION: { + bool Supported; + UR_RETURN_ON_FAILURE(cl_adapter::checkDeviceExtensions( + cl_adapter::cast(hDevice), + {"cl_intel_device_attribute_query"}, Supported)); + if (!Supported) { + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + CL_RETURN_ON_FAILURE( + clGetDeviceInfo(cl_adapter::cast(hDevice), CLPropName, + propSize, pPropValue, pPropSizeRet)); + + return UR_RESULT_SUCCESS; + } + case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { + bool isExtensionSupported; + if (cl_adapter::checkDeviceExtensions( + cl_adapter::cast(hDevice), + {"cl_intel_required_subgroup_size"}, + isExtensionSupported) != UR_RESULT_SUCCESS || + !isExtensionSupported) { + std::vector aThreadIsItsOwnSubGroup({1}); + return ReturnValue(aThreadIsItsOwnSubGroup.data(), + aThreadIsItsOwnSubGroup.size()); + } + // Have to convert size_t to uint32_t size_t SubGroupSizesSize = 0; 
CL_RETURN_ON_FAILURE( @@ -1024,13 +1085,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(ExtStr.find("cl_khr_command_buffer") != std::string::npos); } - case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: { + case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP: { cl_device_id Dev = cl_adapter::cast(hDevice); - bool Supported = false; + ur_device_command_buffer_update_capability_flags_t UpdateCapabilities = 0; CL_RETURN_ON_FAILURE( - deviceSupportsURCommandBufferKernelUpdate(Dev, Supported)); - return ReturnValue(Supported); + getDeviceCommandBufferUpdateCapabilities(Dev, UpdateCapabilities)); + return ReturnValue(UpdateCapabilities); } + case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP: + return ReturnValue(false); default: { return UR_RESULT_ERROR_INVALID_ENUMERATION; } @@ -1125,7 +1188,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t hNativeDevice, ur_platform_handle_t, + ur_native_handle_t hNativeDevice, ur_adapter_handle_t, const ur_device_native_properties_t *, ur_device_handle_t *phDevice) { *phDevice = reinterpret_cast(hNativeDevice); diff --git a/source/adapters/opencl/extension_functions.def b/source/adapters/opencl/extension_functions.def new file mode 100644 index 0000000000..76771744b2 --- /dev/null +++ b/source/adapters/opencl/extension_functions.def @@ -0,0 +1,27 @@ +CL_EXTENSION_FUNC(clHostMemAllocINTEL) +CL_EXTENSION_FUNC(clDeviceMemAllocINTEL) +CL_EXTENSION_FUNC(clSharedMemAllocINTEL) +CL_EXTENSION_FUNC(clGetDeviceFunctionPointer) +CL_EXTENSION_FUNC(clGetDeviceGlobalVariablePointer) +CL_EXTENSION_FUNC(clCreateBufferWithPropertiesINTEL) +CL_EXTENSION_FUNC(clMemBlockingFreeINTEL) +CL_EXTENSION_FUNC(clSetKernelArgMemPointerINTEL) +CL_EXTENSION_FUNC(clEnqueueMemFillINTEL) +CL_EXTENSION_FUNC(clEnqueueMemcpyINTEL) +CL_EXTENSION_FUNC(clGetMemAllocInfoINTEL) 
+CL_EXTENSION_FUNC(clEnqueueWriteGlobalVariable) +CL_EXTENSION_FUNC(clEnqueueReadGlobalVariable) +CL_EXTENSION_FUNC(clEnqueueReadHostPipeINTEL) +CL_EXTENSION_FUNC(clEnqueueWriteHostPipeINTEL) +CL_EXTENSION_FUNC(clSetProgramSpecializationConstant) +CL_EXTENSION_FUNC(clCreateCommandBufferKHR) +CL_EXTENSION_FUNC(clRetainCommandBufferKHR) +CL_EXTENSION_FUNC(clReleaseCommandBufferKHR) +CL_EXTENSION_FUNC(clFinalizeCommandBufferKHR) +CL_EXTENSION_FUNC(clCommandNDRangeKernelKHR) +CL_EXTENSION_FUNC(clCommandCopyBufferKHR) +CL_EXTENSION_FUNC(clCommandCopyBufferRectKHR) +CL_EXTENSION_FUNC(clCommandFillBufferKHR) +CL_EXTENSION_FUNC(clEnqueueCommandBufferKHR) +CL_EXTENSION_FUNC(clGetCommandBufferInfoKHR) +CL_EXTENSION_FUNC(clUpdateMutableCommandsKHR) diff --git a/source/adapters/opencl/image.cpp b/source/adapters/opencl/image.cpp index 80b43255d5..0c628594bb 100644 --- a/source/adapters/opencl/image.cpp +++ b/source/adapters/opencl/image.cpp @@ -117,8 +117,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, [[maybe_unused]] size_t size, [[maybe_unused]] ur_exp_external_mem_type_t memHandleType, - [[maybe_unused]] ur_exp_interop_mem_desc_t *pInteropMemDesc, - [[maybe_unused]] ur_exp_interop_mem_handle_t *phInteropMem) { + [[maybe_unused]] ur_exp_external_mem_desc_t *pExternalMemDesc, + [[maybe_unused]] ur_exp_external_mem_handle_t *phExternalMem) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -127,15 +127,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( [[maybe_unused]] ur_device_handle_t hDevice, [[maybe_unused]] const ur_image_format_t *pImageFormat, [[maybe_unused]] const ur_image_desc_t *pImageDesc, - [[maybe_unused]] ur_exp_interop_mem_handle_t hInteropMem, + [[maybe_unused]] ur_exp_external_mem_handle_t hExternalMem, [[maybe_unused]] ur_exp_image_mem_native_handle_t *phImageMem) { return 
UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, - [[maybe_unused]] ur_exp_interop_mem_handle_t hInteropMem) { + [[maybe_unused]] uint64_t offset, [[maybe_unused]] uint64_t size, + [[maybe_unused]] ur_exp_external_mem_handle_t hExternalMem, + [[maybe_unused]] void **phRetMem) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( + [[maybe_unused]] ur_context_handle_t hContext, + [[maybe_unused]] ur_device_handle_t hDevice, + [[maybe_unused]] ur_exp_external_mem_handle_t hExternalMem) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -143,21 +152,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, [[maybe_unused]] ur_exp_external_semaphore_type_t semHandleType, - [[maybe_unused]] ur_exp_interop_semaphore_desc_t *pInteropSemaphoreDesc, - [[maybe_unused]] ur_exp_interop_semaphore_handle_t *phInteropSemaphore) { + [[maybe_unused]] ur_exp_external_semaphore_desc_t *pExternalSemaphoreDesc, + [[maybe_unused]] ur_exp_external_semaphore_handle_t + *phExternalSemaphoreHandle) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( [[maybe_unused]] ur_context_handle_t hContext, [[maybe_unused]] ur_device_handle_t hDevice, - [[maybe_unused]] ur_exp_interop_semaphore_handle_t hInteropSemaphore) { + [[maybe_unused]] ur_exp_external_semaphore_handle_t hExternalSemaphore) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( [[maybe_unused]] ur_queue_handle_t hQueue, - [[maybe_unused]] 
ur_exp_interop_semaphore_handle_t hSemaphore, + [[maybe_unused]] ur_exp_external_semaphore_handle_t hSemaphore, [[maybe_unused]] bool hasValue, [[maybe_unused]] uint64_t waitValue, [[maybe_unused]] uint32_t numEventsInWaitList, [[maybe_unused]] const ur_event_handle_t *phEventWaitList, @@ -167,7 +177,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( [[maybe_unused]] ur_queue_handle_t hQueue, - [[maybe_unused]] ur_exp_interop_semaphore_handle_t hSemaphore, + [[maybe_unused]] ur_exp_external_semaphore_handle_t hSemaphore, [[maybe_unused]] bool hasValue, [[maybe_unused]] uint64_t signalValue, [[maybe_unused]] uint32_t numEventsInWaitList, [[maybe_unused]] const ur_event_handle_t *phEventWaitList, diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp index 41c6d6de70..617b6a9b2c 100644 --- a/source/adapters/opencl/kernel.cpp +++ b/source/adapters/opencl/kernel.cpp @@ -130,6 +130,10 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } } + if (propName == UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE || + propName == UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE) { + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } CL_RETURN_ON_FAILURE(clGetKernelWorkGroupInfo( cl_adapter::cast(hKernel), cl_adapter::cast(hDevice), @@ -206,19 +210,14 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, // Two calls to urDeviceGetInfo are needed: the first determines the size // required to store the result, and the second returns the actual size // values. 
- ur_result_t URRet = - urDeviceGetInfo(hDevice, UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL, 0, - nullptr, &ResultSize); - if (URRet != UR_RESULT_SUCCESS) { - return URRet; - } - assert(ResultSize % sizeof(size_t) == 0); - std::vector Result(ResultSize / sizeof(size_t)); - URRet = urDeviceGetInfo(hDevice, UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL, - ResultSize, Result.data(), nullptr); - if (URRet != UR_RESULT_SUCCESS) { - return URRet; - } + UR_RETURN_ON_FAILURE(urDeviceGetInfo(hDevice, + UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL, + 0, nullptr, &ResultSize)); + assert(ResultSize % sizeof(uint32_t) == 0); + std::vector Result(ResultSize / sizeof(uint32_t)); + UR_RETURN_ON_FAILURE(urDeviceGetInfo(hDevice, + UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL, + ResultSize, Result.data(), nullptr)); RetVal = *std::max_element(Result.begin(), Result.end()); Ret = CL_SUCCESS; } else if (propName == UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL) { @@ -307,7 +306,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( switch (propName) { case UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS: { - if (*(static_cast(pPropValue)) == true) { + if (*(static_cast(pPropValue))) { UR_RETURN_ON_FAILURE(usmSetIndirectAccess(hKernel)); } return UR_RESULT_SUCCESS; @@ -361,13 +360,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, size_t localWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { - (void)hKernel; - (void)localWorkSize; - (void)dynamicSharedMemorySize; - *pGroupCountRet = 1; - return UR_RESULT_SUCCESS; + [[maybe_unused]] ur_kernel_handle_t hKernel, + [[maybe_unused]] size_t localWorkSize, + [[maybe_unused]] size_t dynamicSharedMemorySize, + [[maybe_unused]] uint32_t *pGroupCountRet) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( @@ -430,3 +427,8 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urKernelGetSuggestedLocalWorkSize( pGlobalWorkSize, pSuggestedLocalWorkSize)); return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants( + ur_kernel_handle_t, uint32_t, const ur_specialization_constant_info_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/opencl/program.cpp b/source/adapters/opencl/program.cpp index d843495b7e..f154a54051 100644 --- a/source/adapters/opencl/program.cpp +++ b/source/adapters/opencl/program.cpp @@ -159,8 +159,8 @@ static cl_int mapURProgramInfoToCL(ur_program_info_t URPropName) { return CL_PROGRAM_NUM_DEVICES; case UR_PROGRAM_INFO_DEVICES: return CL_PROGRAM_DEVICES; - case UR_PROGRAM_INFO_SOURCE: - return CL_PROGRAM_SOURCE; + case UR_PROGRAM_INFO_IL: + return CL_PROGRAM_IL; case UR_PROGRAM_INFO_BINARY_SIZES: return CL_PROGRAM_BINARY_SIZES; case UR_PROGRAM_INFO_BINARIES: diff --git a/source/adapters/opencl/sampler.cpp b/source/adapters/opencl/sampler.cpp index f05177a987..a47ba7f894 100644 --- a/source/adapters/opencl/sampler.cpp +++ b/source/adapters/opencl/sampler.cpp @@ -158,16 +158,38 @@ urSamplerGetInfo(ur_sampler_handle_t hSampler, ur_sampler_info_t propName, static_assert(sizeof(cl_addressing_mode) == sizeof(ur_sampler_addressing_mode_t)); - size_t CheckPropSize = 0; - ur_result_t Err = mapCLErrorToUR( - clGetSamplerInfo(cl_adapter::cast(hSampler), SamplerInfo, - propSize, pPropValue, &CheckPropSize)); - if (pPropValue && CheckPropSize != propSize) { - return UR_RESULT_ERROR_INVALID_SIZE; - } - UR_RETURN_ON_FAILURE(Err); - if (pPropSizeRet) { - *pPropSizeRet = CheckPropSize; + ur_result_t Err = UR_RESULT_SUCCESS; + // ur_bool_t has a size of uint8_t, but cl_bool has the size of + // uint32_t so this adjusts UR_SAMPLER_INFO_NORMALIZED_COORDS info to map + // between them. 
+ if (propName == UR_SAMPLER_INFO_NORMALIZED_COORDS) { + cl_bool normalized_coords = false; + Err = mapCLErrorToUR( + clGetSamplerInfo(cl_adapter::cast(hSampler), SamplerInfo, + sizeof(cl_bool), &normalized_coords, nullptr)); + if (pPropValue && propSize != sizeof(ur_bool_t)) { + return UR_RESULT_ERROR_INVALID_SIZE; + } + UR_RETURN_ON_FAILURE(Err); + if (pPropValue) { + *static_cast(pPropValue) = + static_cast(normalized_coords); + } + if (pPropSizeRet) { + *pPropSizeRet = sizeof(ur_bool_t); + } + } else { + size_t CheckPropSize = 0; + Err = mapCLErrorToUR( + clGetSamplerInfo(cl_adapter::cast(hSampler), SamplerInfo, + propSize, pPropValue, &CheckPropSize)); + if (pPropValue && CheckPropSize != propSize) { + return UR_RESULT_ERROR_INVALID_SIZE; + } + UR_RETURN_ON_FAILURE(Err); + if (pPropSizeRet) { + *pPropSizeRet = CheckPropSize; + } } // Convert OpenCL returns to UR diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index 431cdf488c..6cd69d84d3 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -124,7 +124,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgSampler = urKernelSetArgSampler; pDdiTable->pfnSetArgValue = urKernelSetArgValue; pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; - pDdiTable->pfnSetSpecializationConstants = nullptr; + pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; return UR_RESULT_SUCCESS; } @@ -308,6 +308,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( pDdiTable->pfnCommandGetInfoExp = urCommandBufferCommandGetInfoExp; pDdiTable->pfnReleaseCommandExp = urCommandBufferReleaseCommandExp; pDdiTable->pfnRetainCommandExp = urCommandBufferRetainCommandExp; + pDdiTable->pfnUpdateWaitEventsExp = urCommandBufferUpdateWaitEventsExp; + 
pDdiTable->pfnUpdateSignalEventExp = urCommandBufferUpdateSignalEventExp; return retVal; } @@ -347,7 +349,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnImportExternalMemoryExp = urBindlessImagesImportExternalMemoryExp; pDdiTable->pfnMapExternalArrayExp = urBindlessImagesMapExternalArrayExp; - pDdiTable->pfnReleaseInteropExp = urBindlessImagesReleaseInteropExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + urBindlessImagesMapExternalLinearMemoryExp; + pDdiTable->pfnReleaseExternalMemoryExp = + urBindlessImagesReleaseExternalMemoryExp; pDdiTable->pfnImportExternalSemaphoreExp = urBindlessImagesImportExternalSemaphoreExp; pDdiTable->pfnReleaseExternalSemaphoreExp = diff --git a/source/adapters/opencl/usm.cpp b/source/adapters/opencl/usm.cpp index 77a2941e0f..03ee4a18f3 100644 --- a/source/adapters/opencl/usm.cpp +++ b/source/adapters/opencl/usm.cpp @@ -12,6 +12,13 @@ #include "common.hpp" +namespace umf { +ur_result_t getProviderNativeError(const char *, int32_t) { + // TODO: implement when UMF supports OpenCL + return UR_RESULT_ERROR_UNKNOWN; +} +} // namespace umf + inline cl_mem_alloc_flags_intel hostDescToClFlags(const ur_usm_host_desc_t &desc) { cl_mem_alloc_flags_intel allocFlags = 0; @@ -87,6 +94,11 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, void *Ptr = nullptr; uint32_t Alignment = pUSMDesc ? pUSMDesc->align : 0; + if (pUSMDesc && pUSMDesc->align != 0 && + ((pUSMDesc->align & (pUSMDesc->align - 1)) != 0)) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + std::vector AllocProperties; if (pUSMDesc && pUSMDesc->pNext) { UR_RETURN_ON_FAILURE(usmDescToCLMemProperties( @@ -130,6 +142,11 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, void *Ptr = nullptr; uint32_t Alignment = pUSMDesc ? 
pUSMDesc->align : 0; + if (pUSMDesc && pUSMDesc->align != 0 && + ((pUSMDesc->align & (pUSMDesc->align - 1)) != 0)) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + std::vector AllocProperties; if (pUSMDesc && pUSMDesc->pNext) { UR_RETURN_ON_FAILURE(usmDescToCLMemProperties( @@ -173,6 +190,11 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, void *Ptr = nullptr; uint32_t Alignment = pUSMDesc ? pUSMDesc->align : 0; + if (pUSMDesc && pUSMDesc->align != 0 && + ((pUSMDesc->align & (pUSMDesc->align - 1)) != 0)) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + std::vector AllocProperties; if (pUSMDesc && pUSMDesc->pNext) { UR_RETURN_ON_FAILURE(usmDescToCLMemProperties( diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 09591f3b32..1d3f699973 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -3,12 +3,16 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +if (UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_L0_V2) + include(FetchLevelZero) + set(UMF_BUILD_LEVEL_ZERO_PROVIDER ON CACHE INTERNAL "Build Level Zero Provider") + set(UMF_LEVEL_ZERO_INCLUDE_DIR "${LEVEL_ZERO_INCLUDE_DIR}" CACHE INTERNAL "Level Zero headers") +endif() + add_ur_library(ur_common STATIC - umf_helpers.hpp - umf_pools/disjoint_pool_config_parser.cpp - ur_pool_manager.hpp ur_util.cpp ur_util.hpp + latency_tracker.hpp $<$:windows/ur_lib_loader.cpp> $<$:linux/ur_lib_loader.cpp> ) @@ -26,8 +30,8 @@ if (NOT DEFINED UMF_REPO) endif() if (NOT DEFINED UMF_TAG) - # Merge pull request #119 from ldorau/Fix_arena_extent_split_and_arena_extent_merge - set(UMF_TAG 9bf7a0dc4dff76844e10edbb5c6e9d917536ef6d) + # main 03.10.2024: Add umfIsFreeOpDefault(hProvider) + set(UMF_TAG fa006eea503a58e79fa197891f1391c9dcda73e1) endif() message(STATUS "Will fetch Unified Memory Framework from ${UMF_REPO}") @@ -38,21 +42,41 @@ FetchContent_Declare(unified-memory-framework GIT_TAG ${UMF_TAG} ) -if(UR_BUILD_TESTS) - 
set(UMF_BUILD_TESTS ON CACHE INTERNAL "Build UMF tests") -else() - set(UMF_BUILD_TESTS OFF CACHE INTERNAL "Build UMF tests") +if (UR_STATIC_ADAPTER_L0) + if (UMF_BUILD_SHARED_LIBRARY) + message(STATUS "Static adapter is not compatible with shared UMF, switching to fully statically linked UMF") + set(UMF_BUILD_SHARED_LIBRARY OFF) + endif() endif() + +set(UMF_BUILD_TESTS OFF CACHE INTERNAL "Build UMF tests") +set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "Build UMF examples") +set(UMF_BUILD_SHARED_LIBRARY ${UMF_BUILD_SHARED_LIBRARY} CACHE INTERNAL "Build UMF shared library") set(UMF_BUILD_LIBUMF_POOL_DISJOINT ON CACHE INTERNAL "Build Disjoint Pool") -set(UMF_BUILD_OS_MEMORY_PROVIDER OFF CACHE INTERNAL "Build OS Provider") +set(UMF_BUILD_CUDA_PROVIDER OFF CACHE INTERNAL "Build UMF CUDA provider") FetchContent_MakeAvailable(unified-memory-framework) FetchContent_GetProperties(unified-memory-framework) +if(UR_ENABLE_LATENCY_HISTOGRAM) + set(HDR_HISTOGRAM_BUILD_STATIC CACHE INTERNAL ON "") + set(HDR_HISTOGRAM_BUILD_SHARED CACHE INTERNAL OFF "") + + include(FetchContent) + FetchContent_Declare(hdr_histogram + GIT_REPOSITORY https://github.com/HdrHistogram/HdrHistogram_c.git + GIT_TAG 0.11.8 + ) + + FetchContent_MakeAvailable(hdr_histogram) + FetchContent_GetProperties(hdr_histogram) + + target_link_libraries(ur_common PUBLIC hdr_histogram_static) + target_include_directories(ur_common PUBLIC $) + target_compile_options(ur_common PUBLIC -DUR_ENABLE_LATENCY_HISTOGRAM=1) +endif() + target_link_libraries(ur_common PUBLIC - unified-memory-framework::umf - unified-memory-framework::headers - unified-memory-framework::disjoint_pool ${CMAKE_DL_LIBS} ${PROJECT_NAME}::headers ) @@ -70,3 +94,18 @@ install(TARGETS ur_common RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) + +add_library(ur_umf INTERFACE) +target_sources(ur_umf INTERFACE + $ + $ + $ +) + +add_library(${PROJECT_NAME}::umf ALIAS ur_umf) + +target_link_libraries(ur_umf INTERFACE + 
umf::umf + umf::headers + umf::disjoint_pool +) diff --git a/source/common/latency_tracker.hpp b/source/common/latency_tracker.hpp new file mode 100644 index 0000000000..bf20e3819f --- /dev/null +++ b/source/common/latency_tracker.hpp @@ -0,0 +1,232 @@ +//===--------- latency_tracker.cpp - common ------------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include "logger/ur_logger.hpp" + +#if defined(UR_ENABLE_LATENCY_HISTOGRAM) + +#include + +static inline bool trackLatency = []() { + try { + auto map = getenv_to_map("UR_LOG_LATENCY"); + + if (!map.has_value()) { + return false; + } + + auto it = map->find("level"); + return it != map->end() && + logger::str_to_level(it->second.front()) != logger::Level::QUIET; + } catch (...) 
{ + return false; + } +}(); + +static constexpr size_t numPercentiles = 7; +static constexpr double percentiles[numPercentiles] = { + 50.0, 90.0, 99.0, 99.9, 99.99, 99.999, 99.9999}; + +struct latencyValues { + int64_t count; + int64_t min; + int64_t max; + int64_t mean; + int64_t stddev; + int64_t percentileValues[numPercentiles]; +}; + +using histogram_ptr = + std::unique_ptr; + +static inline latencyValues getValues(const struct hdr_histogram *histogram) { + latencyValues values; + values.count = histogram->total_count; + values.max = hdr_max(histogram); + values.min = hdr_min(histogram); + values.mean = static_cast(hdr_mean(histogram)); + values.stddev = static_cast(hdr_stddev(histogram)); + + auto ret = hdr_value_at_percentiles( + histogram, percentiles, values.percentileValues, numPercentiles); + if (ret != 0) { + logger::error("Failed to get percentiles from latency histogram"); + } + + return values; +} + +class latency_printer { + public: + latency_printer() : logger(logger::create_logger("latency", true, false)) {} + + inline void publishLatency(const std::string &name, + histogram_ptr histogram) { + auto [it, inserted] = values.try_emplace(name, std::move(histogram)); + if (!inserted) { + // combine histograms + hdr_add(it->second.get(), histogram.get()); + } + } + + inline ~latency_printer() { + if (trackLatency) { + print(); + } + } + + inline void print() { + printHeader(); + + for (auto &[name, histogram] : values) { + auto value = getValues(histogram.get()); + auto f = groupDigits; + logger.log( + logger::Level::INFO, + "{},{},{},{},{},{},{},{},{},{},{},{},{},{},ns", name, + f(value.mean), f(value.percentileValues[0]), + f(value.percentileValues[1]), f(value.percentileValues[2]), + f(value.percentileValues[3]), f(value.percentileValues[4]), + f(value.percentileValues[5]), f(value.percentileValues[6]), + f(value.count), f(value.count * value.mean), f(value.min), + f(value.max), value.stddev); + } + } + + private: + inline void printHeader() { + 
logger.log(logger::Level::INFO, "Latency histogram:"); + logger.log(logger::Level::INFO, + "name,mean,p{},p{},p{},p{},p{},p{}" + ",p{},count,sum,min,max,stdev,unit", + percentiles[0], percentiles[1], percentiles[2], + percentiles[3], percentiles[4], percentiles[5], + percentiles[6]); + } + + std::map values; + logger::Logger logger; +}; + +inline latency_printer &globalLatencyPrinter() { + static latency_printer printer; + return printer; +} + +class latency_histogram { + public: + inline latency_histogram(const char *name, + latency_printer &printer = globalLatencyPrinter(), + int64_t lowestDiscernibleValue = 1, + int64_t highestTrackableValue = 100'000'000'000, + int significantFigures = 3) + : name(name), histogram(nullptr, nullptr), printer(printer) { + if (trackLatency) { + struct hdr_histogram *cHistogram; + auto ret = hdr_init(lowestDiscernibleValue, highestTrackableValue, + significantFigures, &cHistogram); + if (ret != 0) { + logger::error("Failed to initialize latency histogram"); + } + histogram = + std::unique_ptr( + cHistogram, &hdr_close); + } + } + + latency_histogram(const latency_histogram &) = delete; + latency_histogram(latency_histogram &&) = delete; + + inline ~latency_histogram() { + if (!trackLatency || !histogram) { + return; + } + + if (hdr_min(histogram.get()) == std::numeric_limits::max()) { + logger::info("[{}] latency: no data", name); + return; + } + + printer.publishLatency(name, std::move(histogram)); + } + + inline void trackValue(int64_t value) { + hdr_record_value(histogram.get(), value); + } + + private: + const char *name; + histogram_ptr histogram; + latency_printer &printer; +}; + +class latency_tracker { + public: + inline explicit latency_tracker(latency_histogram &stats) + : stats(trackLatency ? 
&stats : nullptr), begin() { + if (trackLatency) { + begin = std::chrono::steady_clock::now(); + } + } + inline latency_tracker() {} + inline ~latency_tracker() { + if (stats) { + auto tp = std::chrono::steady_clock::now(); + auto diffNanos = + std::chrono::duration_cast(tp - begin) + .count(); + stats->trackValue(static_cast(diffNanos)); + } + } + + latency_tracker(const latency_tracker &) = delete; + latency_tracker &operator=(const latency_tracker &) = delete; + + inline latency_tracker(latency_tracker &&rhs) noexcept + : stats(rhs.stats), begin(rhs.begin) { + rhs.stats = nullptr; + } + + inline latency_tracker &operator=(latency_tracker &&rhs) noexcept { + if (this != &rhs) { + this->~latency_tracker(); + new (this) latency_tracker(std::move(rhs)); + } + return *this; + } + + private: + latency_histogram *stats{nullptr}; + std::chrono::time_point begin; +}; + +// To resolve __COUNTER__ +#define CONCAT(a, b) a##b + +// Each tracker has it's own thread-local histogram. +// At program exit, all histograms for the same scope are +// aggregated. 
+#define TRACK_SCOPE_LATENCY_CNT(name, cnt) \ + static thread_local latency_histogram CONCAT(histogram, cnt)(name); \ + latency_tracker CONCAT(tracker, cnt)(CONCAT(histogram, cnt)); +#define TRACK_SCOPE_LATENCY(name) TRACK_SCOPE_LATENCY_CNT(name, __COUNTER__) + +#else // UR_ENABLE_LATENCY_HISTOGRAM + +#define TRACK_SCOPE_LATENCY(name) + +#endif // UR_ENABLE_LATENCY_HISTOGRAM diff --git a/source/common/linux/ur_lib_loader.cpp b/source/common/linux/ur_lib_loader.cpp index d316e48b74..4da7f98bc1 100644 --- a/source/common/linux/ur_lib_loader.cpp +++ b/source/common/linux/ur_lib_loader.cpp @@ -23,6 +23,8 @@ void LibLoader::freeAdapterLibrary(HMODULE handle) { logger::error( "Failed to unload the library with the handle at address {}", handle); + } else { + logger::info("unloaded adapter 0x{}", handle); } } } @@ -42,8 +44,15 @@ LibLoader::loadAdapterLibrary(const char *name) { mode |= RTLD_DEEPBIND; } #endif - - return std::unique_ptr(dlopen(name, mode)); + HMODULE handle = dlopen(name, mode); + if (!handle) { + char *err = dlerror(); + logger::info("failed to load adapter '{}' with error: {}", name, + err ? 
err : "unknown error"); + } else { + logger::info("loaded adapter 0x{} ({})", handle, name); + } + return std::unique_ptr(handle); } void *LibLoader::getFunctionPtr(HMODULE handle, const char *func_name) { diff --git a/source/common/logger/ur_logger_details.hpp b/source/common/logger/ur_logger_details.hpp index f17d3b3f64..9c9462935e 100644 --- a/source/common/logger/ur_logger_details.hpp +++ b/source/common/logger/ur_logger_details.hpp @@ -30,6 +30,8 @@ class Logger { void setLevel(logger::Level level) { this->level = level; } + logger::Level getLevel() { return this->level; } + void setFlushLevel(logger::Level level) { if (sink) { this->sink->setFlushLevel(level); diff --git a/source/common/logger/ur_sinks.hpp b/source/common/logger/ur_sinks.hpp index b2ebf72a7a..e0d8144a31 100644 --- a/source/common/logger/ur_sinks.hpp +++ b/source/common/logger/ur_sinks.hpp @@ -17,6 +17,10 @@ namespace logger { +#if defined(_WIN32) +inline bool isTearDowned = false; +#endif + class Sink { public: template @@ -28,7 +32,21 @@ class Sink { } format(buffer, fmt, std::forward(args)...); +// This is a temporary workaround on windows, where UR adapter is teardowned +// before the UR loader, which will result in access violation when we use print +// function as the overrided print function was already released with the UR +// adapter. +// TODO: Change adapters to use a common sink class in the loader instead of +// using thier own sink class that inherit from logger::Sink. 
+#if defined(_WIN32) + if (isTearDowned) { + std::cerr << buffer.str() << "\n"; + } else { + print(level, buffer.str()); + } +#else print(level, buffer.str()); +#endif } void setFlushLevel(logger::Level level) { this->flush_level = level; } diff --git a/source/common/stype_map_helpers.def b/source/common/stype_map_helpers.def index 270ae7ba4f..c938ca6b95 100644 --- a/source/common/stype_map_helpers.def +++ b/source/common/stype_map_helpers.def @@ -84,9 +84,9 @@ struct stype_map : stype_map_impl template <> struct stype_map : stype_map_impl {}; template <> -struct stype_map : stype_map_impl {}; +struct stype_map : stype_map_impl {}; template <> -struct stype_map : stype_map_impl {}; +struct stype_map : stype_map_impl {}; template <> struct stype_map : stype_map_impl {}; template <> diff --git a/source/common/umf_helpers.hpp b/source/common/umf_helpers.hpp index 2cbebf3670..d067b8ab1a 100644 --- a/source/common/umf_helpers.hpp +++ b/source/common/umf_helpers.hpp @@ -1,6 +1,6 @@ /* * - * Copyright (C) 2023 Intel Corporation + * Copyright (C) 2023-2024 Intel Corporation * * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. * See LICENSE.TXT @@ -17,6 +17,8 @@ #include #include +#include "logger/ur_logger.hpp" + #include #include #include @@ -33,6 +35,35 @@ using provider_unique_handle_t = std::unique_ptr>; +#define DEFINE_CHECK_OP(op) \ + template class HAS_OP_##op { \ + typedef char check_success; \ + typedef long check_fail; \ + template static check_success test(decltype(&U::op)); \ + template static check_fail test(...); \ + \ + public: \ + static constexpr bool value = \ + sizeof(test(0)) == sizeof(check_success); \ + }; \ + \ + template \ + static inline \ + typename std::enable_if::value, umf_result_t>::type \ + CALL_OP_##op(T *t, Args &&...args) { \ + return t->op(std::forward(args)...); \ + } \ + \ + static inline umf_result_t CALL_OP_##op(...) 
{ \ + return UMF_RESULT_ERROR_NOT_SUPPORTED; \ + } + +DEFINE_CHECK_OP(get_ipc_handle_size) +DEFINE_CHECK_OP(get_ipc_handle) +DEFINE_CHECK_OP(put_ipc_handle) +DEFINE_CHECK_OP(open_ipc_handle) +DEFINE_CHECK_OP(close_ipc_handle) + #define UMF_ASSIGN_OP(ops, type, func, default_return) \ ops.func = [](void *obj, auto... args) { \ try { \ @@ -50,6 +81,15 @@ using provider_unique_handle_t = } \ } +#define UMF_ASSIGN_OP_OPT(ops, type, func, default_return) \ + ops.func = [](void *obj, auto... args) { \ + try { \ + return CALL_OP_##func(reinterpret_cast(obj), args...); \ + } catch (...) { \ + return default_return; \ + } \ + } + namespace detail { template umf_result_t initialize(T *obj, ArgsTuple &&args) { @@ -69,7 +109,7 @@ umf_result_t initialize(T *obj, ArgsTuple &&args) { template umf_memory_pool_ops_t poolMakeUniqueOps() { - umf_memory_pool_ops_t ops; + umf_memory_pool_ops_t ops = {}; ops.version = UMF_VERSION_CURRENT; ops.initialize = [](umf_memory_provider_handle_t provider, void *params, @@ -106,7 +146,7 @@ umf_memory_pool_ops_t poolMakeUniqueOps() { /// forwarded to T::initialize(). 
template auto memoryProviderMakeUnique(Args &&...args) { - umf_memory_provider_ops_t ops; + umf_memory_provider_ops_t ops = {}; auto argsTuple = std::make_tuple(std::forward(args)...); ops.version = UMF_VERSION_CURRENT; @@ -124,13 +164,21 @@ auto memoryProviderMakeUnique(Args &&...args) { ops.finalize = [](void *obj) { delete reinterpret_cast(obj); }; UMF_ASSIGN_OP(ops, T, alloc, UMF_RESULT_ERROR_UNKNOWN); - UMF_ASSIGN_OP(ops, T, free, UMF_RESULT_ERROR_UNKNOWN); UMF_ASSIGN_OP_NORETURN(ops, T, get_last_native_error); UMF_ASSIGN_OP(ops, T, get_recommended_page_size, UMF_RESULT_ERROR_UNKNOWN); UMF_ASSIGN_OP(ops, T, get_min_page_size, UMF_RESULT_ERROR_UNKNOWN); - UMF_ASSIGN_OP(ops, T, purge_lazy, UMF_RESULT_ERROR_UNKNOWN); - UMF_ASSIGN_OP(ops, T, purge_force, UMF_RESULT_ERROR_UNKNOWN); UMF_ASSIGN_OP(ops, T, get_name, ""); + UMF_ASSIGN_OP(ops.ext, T, free, UMF_RESULT_ERROR_UNKNOWN); + UMF_ASSIGN_OP(ops.ext, T, purge_lazy, UMF_RESULT_ERROR_UNKNOWN); + UMF_ASSIGN_OP(ops.ext, T, purge_force, UMF_RESULT_ERROR_UNKNOWN); + UMF_ASSIGN_OP(ops.ext, T, allocation_merge, UMF_RESULT_ERROR_UNKNOWN); + UMF_ASSIGN_OP(ops.ext, T, allocation_split, UMF_RESULT_ERROR_UNKNOWN); + UMF_ASSIGN_OP_OPT(ops.ipc, T, get_ipc_handle_size, + UMF_RESULT_ERROR_UNKNOWN); + UMF_ASSIGN_OP_OPT(ops.ipc, T, get_ipc_handle, UMF_RESULT_ERROR_UNKNOWN); + UMF_ASSIGN_OP_OPT(ops.ipc, T, put_ipc_handle, UMF_RESULT_ERROR_UNKNOWN); + UMF_ASSIGN_OP_OPT(ops.ipc, T, open_ipc_handle, UMF_RESULT_ERROR_UNKNOWN); + UMF_ASSIGN_OP_OPT(ops.ipc, T, close_ipc_handle, UMF_RESULT_ERROR_UNKNOWN); umf_memory_provider_handle_t hProvider = nullptr; auto ret = umfMemoryProviderCreate(&ops, &argsTuple, &hProvider); @@ -146,40 +194,46 @@ auto poolMakeUnique(provider_unique_handle_t provider, Args &&...args) { auto argsTuple = std::make_tuple(std::forward(args)...); auto ops = detail::poolMakeUniqueOps(); - auto hProvider = provider.release(); - - // capture providers and destroy them after the pool is destroyed - auto poolDestructor = 
[hProvider](umf_memory_pool_handle_t hPool) { - umfPoolDestroy(hPool); - umfMemoryProviderDestroy(hProvider); - }; - umf_memory_pool_handle_t hPool = nullptr; - auto ret = umfPoolCreate(&ops, hProvider, &argsTuple, &hPool); + + auto ret = umfPoolCreate(&ops, provider.get(), &argsTuple, + UMF_POOL_CREATE_FLAG_OWN_PROVIDER, &hPool); + if (ret == UMF_RESULT_SUCCESS) { + provider.release(); // pool now owns the provider + } return std::pair{ - ret, pool_unique_handle_t(hPool, std::move(poolDestructor))}; + ret, pool_unique_handle_t(hPool, umfPoolDestroy)}; } static inline auto poolMakeUniqueFromOps(umf_memory_pool_ops_t *ops, provider_unique_handle_t provider, void *params) { umf_memory_pool_handle_t hPool; - auto ret = umfPoolCreate(ops, provider.get(), params, &hPool); + auto ret = umfPoolCreate(ops, provider.get(), params, + UMF_POOL_CREATE_FLAG_OWN_PROVIDER, &hPool); if (ret != UMF_RESULT_SUCCESS) { return std::pair{ ret, pool_unique_handle_t(nullptr, nullptr)}; } - // capture provider and destroy it after the pool is destroyed - auto poolDestructor = - [provider_handle = provider.release()](umf_memory_pool_handle_t pool) { - umfPoolDestroy(pool); - umfMemoryProviderDestroy(provider_handle); - }; + provider.release(); // pool now owns the provider return std::pair{ + UMF_RESULT_SUCCESS, pool_unique_handle_t(hPool, umfPoolDestroy)}; +} + +static inline auto providerMakeUniqueFromOps(umf_memory_provider_ops_t *ops, + void *params) { + umf_memory_provider_handle_t hProvider; + auto ret = umfMemoryProviderCreate(ops, params, &hProvider); + if (ret != UMF_RESULT_SUCCESS) { + return std::pair{ + ret, provider_unique_handle_t(nullptr, nullptr)}; + } + + return std::pair{ UMF_RESULT_SUCCESS, - pool_unique_handle_t(hPool, std::move(poolDestructor))}; + provider_unique_handle_t(hProvider, umfMemoryProviderDestroy)}; } template umf_result_t &getPoolLastStatusRef() { @@ -187,6 +241,9 @@ template umf_result_t &getPoolLastStatusRef() { return last_status; } +ur_result_t 
getProviderNativeError(const char *providerName, + int32_t nativeError); + /// @brief translates UMF return values to UR. /// This function assumes that the native error of /// the last failed memory provider is ur_result_t. @@ -202,10 +259,15 @@ inline ur_result_t umf2urResult(umf_result_t umfResult) { return UR_RESULT_ERROR_UNKNOWN; } - ur_result_t Err = UR_RESULT_ERROR_UNKNOWN; - umfMemoryProviderGetLastNativeError(hProvider, nullptr, - reinterpret_cast(&Err)); - return Err; + int32_t Err = UR_RESULT_ERROR_UNKNOWN; + const char *Msg = nullptr; + umfMemoryProviderGetLastNativeError(hProvider, &Msg, &Err); + + if (Msg) { + logger::error("UMF failed with: {}", Msg); + } + + return getProviderNativeError(umfMemoryProviderGetName(hProvider), Err); } case UMF_RESULT_ERROR_INVALID_ARGUMENT: return UR_RESULT_ERROR_INVALID_ARGUMENT; diff --git a/source/common/ur_pool_manager.hpp b/source/common/ur_pool_manager.hpp index dea80ae5f2..d1f90ccfa6 100644 --- a/source/common/ur_pool_manager.hpp +++ b/source/common/ur_pool_manager.hpp @@ -11,6 +11,8 @@ #ifndef USM_POOL_MANAGER_HPP #define USM_POOL_MANAGER_HPP 1 +#include + #include "logger/ur_logger.hpp" #include "umf_helpers.hpp" #include "ur_api.h" @@ -26,6 +28,26 @@ namespace usm { +namespace detail { +struct ddiTables { + ddiTables() { + auto ret = + urGetDeviceProcAddrTable(UR_API_VERSION_CURRENT, &deviceDdiTable); + if (ret != UR_RESULT_SUCCESS) { + throw ret; + } + + ret = + urGetContextProcAddrTable(UR_API_VERSION_CURRENT, &contextDdiTable); + if (ret != UR_RESULT_SUCCESS) { + throw ret; + } + } + ur_device_dditable_t deviceDdiTable; + ur_context_dditable_t contextDdiTable; +}; +} // namespace detail + /// @brief describes an internal USM pool instance. 
struct pool_descriptor { ur_usm_pool_handle_t poolHandle; @@ -44,9 +66,12 @@ struct pool_descriptor { static inline std::pair> urGetSubDevices(ur_device_handle_t hDevice) { + static detail::ddiTables ddi; + uint32_t nComputeUnits; - auto ret = urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_COMPUTE_UNITS, - sizeof(nComputeUnits), &nComputeUnits, nullptr); + auto ret = ddi.deviceDdiTable.pfnGetInfo( + hDevice, UR_DEVICE_INFO_MAX_COMPUTE_UNITS, sizeof(nComputeUnits), + &nComputeUnits, nullptr); if (ret != UR_RESULT_SUCCESS) { return {ret, {}}; } @@ -64,15 +89,16 @@ urGetSubDevices(ur_device_handle_t hDevice) { // Get the number of devices that will be created uint32_t deviceCount; - ret = urDevicePartition(hDevice, &properties, 0, nullptr, &deviceCount); + ret = ddi.deviceDdiTable.pfnPartition(hDevice, &properties, 0, nullptr, + &deviceCount); if (ret != UR_RESULT_SUCCESS) { return {ret, {}}; } std::vector sub_devices(deviceCount); - ret = urDevicePartition(hDevice, &properties, - static_cast(sub_devices.size()), - sub_devices.data(), nullptr); + ret = ddi.deviceDdiTable.pfnPartition( + hDevice, &properties, static_cast(sub_devices.size()), + sub_devices.data(), nullptr); if (ret != UR_RESULT_SUCCESS) { return {ret, {}}; } @@ -82,17 +108,20 @@ urGetSubDevices(ur_device_handle_t hDevice) { inline std::pair> urGetAllDevicesAndSubDevices(ur_context_handle_t hContext) { + static detail::ddiTables ddi; + size_t deviceCount = 0; - auto ret = urContextGetInfo(hContext, UR_CONTEXT_INFO_NUM_DEVICES, - sizeof(deviceCount), &deviceCount, nullptr); + auto ret = ddi.contextDdiTable.pfnGetInfo( + hContext, UR_CONTEXT_INFO_NUM_DEVICES, sizeof(deviceCount), + &deviceCount, nullptr); if (ret != UR_RESULT_SUCCESS || deviceCount == 0) { return {ret, {}}; } std::vector devices(deviceCount); - ret = urContextGetInfo(hContext, UR_CONTEXT_INFO_DEVICES, - sizeof(ur_device_handle_t) * deviceCount, - devices.data(), nullptr); + ret = ddi.contextDdiTable.pfnGetInfo( + hContext, 
UR_CONTEXT_INFO_DEVICES, + sizeof(ur_device_handle_t) * deviceCount, devices.data(), nullptr); if (ret != UR_RESULT_SUCCESS) { return {ret, {}}; } @@ -135,6 +164,8 @@ isSharedAllocationReadOnlyOnDevice(const pool_descriptor &desc) { } inline bool pool_descriptor::operator==(const pool_descriptor &other) const { + static usm::detail::ddiTables ddi; + const pool_descriptor &lhs = *this; const pool_descriptor &rhs = other; ur_native_handle_t lhsNative = 0, rhsNative = 0; @@ -145,14 +176,16 @@ inline bool pool_descriptor::operator==(const pool_descriptor &other) const { // Ref: https://github.com/intel/llvm/commit/86511c5dc84b5781dcfd828caadcb5cac157eae1 // TODO: is this L0 specific? if (lhs.hDevice) { - auto ret = urDeviceGetNativeHandle(lhs.hDevice, &lhsNative); + auto ret = + ddi.deviceDdiTable.pfnGetNativeHandle(lhs.hDevice, &lhsNative); if (ret != UR_RESULT_SUCCESS) { throw ret; } } if (rhs.hDevice) { - auto ret = urDeviceGetNativeHandle(rhs.hDevice, &rhsNative); + auto ret = + ddi.deviceDdiTable.pfnGetNativeHandle(rhs.hDevice, &rhsNative); if (ret != UR_RESULT_SUCCESS) { throw ret; } @@ -223,11 +256,11 @@ template struct pool_manager { public: static std::pair - create(desc_to_pool_map_t descToHandleMap = {}) { + create(desc_to_pool_map_t &&descToHandleMap = {}) { auto manager = pool_manager(); for (auto &[desc, hPool] : descToHandleMap) { - auto ret = manager.addPool(desc, hPool); + auto ret = manager.addPool(desc, std::move(hPool)); if (ret != UR_RESULT_SUCCESS) { return {ret, pool_manager()}; } @@ -237,7 +270,7 @@ template struct pool_manager { } ur_result_t addPool(const D &desc, - umf::pool_unique_handle_t &hPool) noexcept { + umf::pool_unique_handle_t &&hPool) noexcept { if (!descToPoolMap.try_emplace(desc, std::move(hPool)).second) { logger::error("Pool for pool descriptor: {}, already exists", desc); return UR_RESULT_ERROR_INVALID_ARGUMENT; @@ -264,9 +297,12 @@ namespace std { /// @brief hash specialization for usm::pool_descriptor template <> struct hash 
{ inline size_t operator()(const usm::pool_descriptor &desc) const { + static usm::detail::ddiTables ddi; + ur_native_handle_t native = 0; if (desc.hDevice) { - auto ret = urDeviceGetNativeHandle(desc.hDevice, &native); + auto ret = + ddi.deviceDdiTable.pfnGetNativeHandle(desc.hDevice, &native); if (ret != UR_RESULT_SUCCESS) { throw ret; } diff --git a/source/common/ur_util.cpp b/source/common/ur_util.cpp index e486ff6e1a..176a2e028e 100644 --- a/source/common/ur_util.cpp +++ b/source/common/ur_util.cpp @@ -1,6 +1,6 @@ /* * - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation * * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. * See LICENSE.TXT @@ -9,15 +9,63 @@ */ #include "ur_util.hpp" +#include "logger/ur_logger.hpp" #ifdef _WIN32 #include int ur_getpid(void) { return static_cast(GetCurrentProcessId()); } + +int ur_close_fd(int fd) { return -1; } + +int ur_duplicate_fd(int pid, int fd_in) { + // TODO: find another way to obtain a duplicate of another process's file descriptor + (void)pid; // unused + (void)fd_in; // unused + return -1; +} + #else +#include #include int ur_getpid(void) { return static_cast(getpid()); } -#endif + +int ur_close_fd(int fd) { return close(fd); } + +int ur_duplicate_fd(int pid, int fd_in) { +// pidfd_getfd(2) is used to obtain a duplicate of another process's file descriptor. +// Permission to duplicate another process's file descriptor +// is governed by a ptrace access mode PTRACE_MODE_ATTACH_REALCREDS check (see ptrace(2)) +// that can be changed using the /proc/sys/kernel/yama/ptrace_scope interface. 
+// pidfd_getfd(2) is supported since Linux 5.6 +// pidfd_open(2) is supported since Linux 5.3 +#if defined(__NR_pidfd_open) && defined(__NR_pidfd_getfd) + errno = 0; + int pid_fd = syscall(__NR_pidfd_open, pid, 0); + if (pid_fd == -1) { + logger::error("__NR_pidfd_open"); + return -1; + } + + int fd_dup = syscall(__NR_pidfd_getfd, pid_fd, fd_in, 0); + close(pid_fd); + if (fd_dup == -1) { + logger::error("__NR_pidfd_getfd"); + return -1; + } + + return fd_dup; +#else + // TODO: find another way to obtain a duplicate of another process's file descriptor + (void)pid; // unused + (void)fd_in; // unused + errno = ENOTSUP; // unsupported + logger::error("__NR_pidfd_open or __NR_pidfd_getfd not available"); + return -1; +#endif /* defined(__NR_pidfd_open) && defined(__NR_pidfd_getfd) */ +} + +#endif /* _WIN32 */ std::optional ur_getenv(const char *name) { #if defined(_WIN32) diff --git a/source/common/ur_util.hpp b/source/common/ur_util.hpp index e27d7103be..0ede3c93dc 100644 --- a/source/common/ur_util.hpp +++ b/source/common/ur_util.hpp @@ -1,6 +1,6 @@ /* * - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation * * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. * See LICENSE.TXT @@ -27,6 +27,8 @@ #include int ur_getpid(void); +int ur_close_fd(int fd); +int ur_duplicate_fd(int pid, int fd_in); /* for compatibility with non-clang compilers */ #if defined(__has_feature) @@ -58,12 +60,14 @@ int ur_getpid(void); /////////////////////////////////////////////////////////////////////////////// #if defined(_WIN32) #define MAKE_LIBRARY_NAME(NAME, VERSION) NAME ".dll" +#define STATIC_LIBRARY_EXTENSION ".lib" #else #if defined(__APPLE__) #define MAKE_LIBRARY_NAME(NAME, VERSION) "lib" NAME "." VERSION ".dylib" #else #define MAKE_LIBRARY_NAME(NAME, VERSION) "lib" NAME ".so." 
VERSION #endif +#define STATIC_LIBRARY_EXTENSION ".a" #endif inline std::string create_library_path(const char *name, const char *path) { @@ -460,7 +464,7 @@ template class AtomicSingleton { static int release(std::function deleter) { auto val = instance.acquire(); - int ret = val->release(deleter); + int ret = val->release(std::move(deleter)); instance.release(); return ret; @@ -476,6 +480,25 @@ template class AtomicSingleton { } }; +template +static inline std::string groupDigits(Numeric numeric) { + auto number = std::to_string(numeric); + std::string sign = numeric >= 0 ? "" : "-"; + auto digits = number.substr(sign.size(), number.size() - sign.size()); + + std::string separated; + + for (size_t i = 0; i < digits.size(); i++) { + separated.push_back(digits[i]); + + if (i != digits.size() - 1 && (digits.size() - i - 1) % 3 == 0) { + separated.push_back('\''); + } + } + + return sign + separated; +} + template Spinlock> AtomicSingleton::instance; #endif /* UR_UTIL_H */ diff --git a/source/loader/CMakeLists.txt b/source/loader/CMakeLists.txt index 35f32a5424..07dab17943 100644 --- a/source/loader/CMakeLists.txt +++ b/source/loader/CMakeLists.txt @@ -19,6 +19,7 @@ add_ur_library(ur_loader "" ${CMAKE_CURRENT_BINARY_DIR}/UrLoaderVersion.rc ) +install_ur_library(ur_loader) if (MSVC) set(TARGET_LIBNAME ur_loader) @@ -47,6 +48,7 @@ add_library(${PROJECT_NAME}::loader ALIAS ur_loader) target_include_directories(ur_loader PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/.. 
${CMAKE_CURRENT_SOURCE_DIR}/layers ) @@ -60,6 +62,13 @@ target_link_libraries(ur_loader PRIVATE ${PROJECT_NAME}::headers ) +if(UR_STATIC_ADAPTER_L0) + target_link_libraries(ur_loader PRIVATE + ur_adapter_level_zero + ) + target_compile_definitions(ur_loader PRIVATE UR_STATIC_ADAPTER_LEVEL_ZERO) +endif() + if(UR_ENABLE_TRACING) target_link_libraries(ur_loader PRIVATE ${TARGET_XPTI}) target_include_directories(ur_loader PRIVATE ${xpti_SOURCE_DIR}/include) @@ -93,13 +102,6 @@ if(UNIX) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/libur_loader.pc" DESTINATION "${CMAKE_INSTALL_FULL_LIBDIR}/pkgconfig" COMPONENT unified-runtime) endif() -install(TARGETS ur_loader - EXPORT ${PROJECT_NAME}-targets - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT unified-runtime -) - target_sources(ur_loader PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/ur_object.hpp @@ -136,12 +138,18 @@ if(UR_ENABLE_SANITIZER) ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_interceptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_interceptor.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_libdevice.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_options.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_options.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_quarantine.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_quarantine.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_report.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_report.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_shadow_setup.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_shadow_setup.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_shadow.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_shadow.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_statistics.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_statistics.hpp + 
${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_validator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_validator.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/common.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/stacktrace.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/stacktrace.hpp @@ -158,6 +166,35 @@ if(UR_ENABLE_SANITIZER) ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/linux/sanitizer_utils.cpp ) + if(UR_ENABLE_SYMBOLIZER) + target_sources(ur_loader + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/linux/symbolizer.cpp + ) + target_include_directories(ur_loader PRIVATE ${LLVM_INCLUDE_DIRS}) + target_link_libraries(ur_loader PRIVATE LLVMSupport LLVMSymbolize) + # In in-tree build, if LLVM is built with libc++, we also need to build + # symbolizer.cpp with libc++ abi and link libc++ in. + if(NOT UR_STANDALONE_BUILD AND LLVM_LIBCXX_USED) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libc++.a + OUTPUT_VARIABLE LIBCXX_PATH + OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libc++abi.a + OUTPUT_VARIABLE LIBCXX_ABI_PATH + OUTPUT_STRIP_TRAILING_WHITESPACE) + set_property(SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/linux/symbolizer.cpp + APPEND_STRING PROPERTY COMPILE_FLAGS + " -stdlib=libc++ ") + if(NOT EXISTS ${LIBCXX_PATH} OR NOT EXISTS ${LIBCXX_ABI_PATH}) + message(FATAL_ERROR "libc++ is required but can't find the libraries") + endif() + target_link_libraries(ur_loader PRIVATE ${LIBCXX_PATH} ${LIBCXX_ABI_PATH}) + endif() + endif() + target_include_directories(ur_loader PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer" "${CMAKE_CURRENT_SOURCE_DIR}/../" diff --git a/source/loader/layers/sanitizer/asan_allocator.hpp b/source/loader/layers/sanitizer/asan_allocator.hpp index 88dfd2d074..249ef896d0 100644 --- a/source/loader/layers/sanitizer/asan_allocator.hpp +++ b/source/loader/layers/sanitizer/asan_allocator.hpp @@ -45,6 +45,7 @@ struct AllocInfo { 
StackTrace ReleaseStack; void print(); + size_t getRedzoneSize() { return AllocSize - (UserEnd - UserBegin); } }; using AllocationMap = std::map>; diff --git a/source/loader/layers/sanitizer/asan_buffer.cpp b/source/loader/layers/sanitizer/asan_buffer.cpp index 4cf90c7da4..9316d68bf4 100644 --- a/source/loader/layers/sanitizer/asan_buffer.cpp +++ b/source/loader/layers/sanitizer/asan_buffer.cpp @@ -75,12 +75,23 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { return UR_RESULT_SUCCESS; } + // Device may be null, we follow the L0 adapter's practice to use the first + // device + if (!Device) { + auto Devices = GetDevices(Context); + assert(Devices.size() > 0 && "Devices should not be empty"); + Device = Devices[0]; + } + assert((void *)Device != nullptr && "Device cannot be nullptr"); + + std::scoped_lock Guard(Mutex); auto &Allocation = Allocations[Device]; + ur_result_t URes = UR_RESULT_SUCCESS; if (!Allocation) { ur_usm_desc_t USMDesc{}; USMDesc.align = getAlignment(); ur_usm_pool_handle_t Pool{}; - ur_result_t URes = getContext()->interceptor->allocateMemory( + URes = getContext()->interceptor->allocateMemory( Context, Device, &USMDesc, Pool, Size, AllocType::MEM_BUFFER, ur_cast(&Allocation)); if (URes != UR_RESULT_SUCCESS) { @@ -105,7 +116,60 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { Handle = Allocation; - return UR_RESULT_SUCCESS; + if (!LastSyncedDevice.hDevice) { + LastSyncedDevice = MemBuffer::Device_t{Device, Handle}; + return URes; + } + + // If the device required to allocate memory is not the previous one, we + // need to do data migration. 
+ if (Device != LastSyncedDevice.hDevice) { + auto &HostAllocation = Allocations[nullptr]; + if (!HostAllocation) { + ur_usm_desc_t USMDesc{}; + USMDesc.align = getAlignment(); + ur_usm_pool_handle_t Pool{}; + URes = getContext()->interceptor->allocateMemory( + Context, nullptr, &USMDesc, Pool, Size, AllocType::HOST_USM, + ur_cast(&HostAllocation)); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("Failed to allocate {} bytes host " + "USM for buffer {} migration", + Size, this); + return URes; + } + } + + // Copy data from last synced device to host + { + ManagedQueue Queue(Context, LastSyncedDevice.hDevice); + URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + Queue, true, HostAllocation, LastSyncedDevice.MemHandle, Size, + 0, nullptr, nullptr); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error( + "Failed to migrate memory buffer data"); + return URes; + } + } + + // Sync data back to device + { + ManagedQueue Queue(Context, Device); + URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + Queue, true, Allocation, HostAllocation, Size, 0, nullptr, + nullptr); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error( + "Failed to migrate memory buffer data"); + return URes; + } + } + } + + LastSyncedDevice = MemBuffer::Device_t{Device, Handle}; + + return URes; } ur_result_t MemBuffer::free() { @@ -128,8 +192,8 @@ size_t MemBuffer::getAlignment() { // usually choose a very large size (more than 1k). Then sanitizer will // allocate extra unnessary memory. Not sure if this will impact // performance. 
- size_t MsbIdx = 63 - __builtin_clz(Size); - size_t Alignment = (1 << (MsbIdx + 1)); + size_t MsbIdx = 63 - __builtin_clzl(Size); + size_t Alignment = (1ULL << (MsbIdx + 1)); if (Alignment > 128) { Alignment = 128; } diff --git a/source/loader/layers/sanitizer/asan_buffer.hpp b/source/loader/layers/sanitizer/asan_buffer.hpp index b4eba4e4ba..989ef4249f 100644 --- a/source/loader/layers/sanitizer/asan_buffer.hpp +++ b/source/loader/layers/sanitizer/asan_buffer.hpp @@ -48,6 +48,12 @@ struct MemBuffer { ur_context_handle_t Context; + struct Device_t { + ur_device_handle_t hDevice; + char *MemHandle; + }; + Device_t LastSyncedDevice{}; + size_t Size; char *HostPtr{}; diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index 0deb021a3f..ba1e9e2088 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -15,178 +15,39 @@ #include "asan_options.hpp" #include "asan_quarantine.hpp" #include "asan_report.hpp" -#include "asan_shadow_setup.hpp" +#include "asan_shadow.hpp" +#include "asan_validator.hpp" #include "stacktrace.hpp" #include "ur_sanitizer_utils.hpp" namespace ur_sanitizer_layer { -namespace { - -uptr MemToShadow_CPU(uptr USM_SHADOW_BASE, uptr UPtr) { - return USM_SHADOW_BASE + (UPtr >> ASAN_SHADOW_SCALE); -} - -uptr MemToShadow_DG2(uptr USM_SHADOW_BASE, uptr UPtr) { - if (UPtr & 0xFFFF000000000000ULL) { // Device USM - return USM_SHADOW_BASE + 0x80000000000ULL + - ((UPtr & 0x7FFFFFFFFFFFULL) >> ASAN_SHADOW_SCALE); - } else { // Host/Shared USM - return USM_SHADOW_BASE + (UPtr >> ASAN_SHADOW_SCALE); - } -} - -uptr MemToShadow_PVC(uptr USM_SHADOW_BASE, uptr UPtr) { - if (UPtr & 0xFF00000000000000ULL) { // Device USM - return USM_SHADOW_BASE + 0x80000000000ULL + - ((UPtr & 0xFFFFFFFFFFFFULL) >> ASAN_SHADOW_SCALE); - } else { // Only consider 47bit VA - return USM_SHADOW_BASE + - ((UPtr & 0x7FFFFFFFFFFFULL) >> ASAN_SHADOW_SCALE); - } 
-} - -ur_result_t urEnqueueUSMSet(ur_queue_handle_t Queue, void *Ptr, char Value, - size_t Size, uint32_t NumEvents = 0, - const ur_event_handle_t *EventWaitList = nullptr, - ur_event_handle_t *OutEvent = nullptr) { - if (Size == 0) { - return UR_RESULT_SUCCESS; +SanitizerInterceptor::SanitizerInterceptor() { + if (getOptions().MaxQuarantineSizeMB) { + m_Quarantine = std::make_unique( + static_cast(getOptions().MaxQuarantineSizeMB) * 1024 * + 1024); } - return getContext()->urDdiTable.Enqueue.pfnUSMFill( - Queue, Ptr, 1, &Value, Size, NumEvents, EventWaitList, OutEvent); } -ur_result_t enqueueMemSetShadow(ur_context_handle_t Context, - std::shared_ptr &DeviceInfo, - ur_queue_handle_t Queue, uptr Ptr, uptr Size, - u8 Value) { - if (Size == 0) { - return UR_RESULT_SUCCESS; - } - if (DeviceInfo->Type == DeviceType::CPU) { - /// - /// CPU Device: CPU needs to use a special memset function - /// - uptr ShadowBegin = MemToShadow_CPU(DeviceInfo->ShadowOffset, Ptr); - uptr ShadowEnd = - MemToShadow_CPU(DeviceInfo->ShadowOffset, Ptr + Size - 1); - - // Poison shadow memory outside of asan runtime is not allowed, so we - // need to avoid memset's call from being intercepted. 
- static auto MemSet = - (void *(*)(void *, int, size_t))GetMemFunctionPointer("memset"); - if (!MemSet) { - return UR_RESULT_ERROR_UNKNOWN; - } - getContext()->logger.debug( - "enqueueMemSetShadow(addr={}, count={}, value={})", - (void *)ShadowBegin, ShadowEnd - ShadowBegin + 1, - (void *)(size_t)Value); - MemSet((void *)ShadowBegin, Value, ShadowEnd - ShadowBegin + 1); - } else { - /// - /// GPU Device: GPU needs to manually map physical memory before memset - /// - uptr ShadowBegin = 0, ShadowEnd = 0; - - if (DeviceInfo->Type == DeviceType::GPU_PVC) { - ShadowBegin = MemToShadow_PVC(DeviceInfo->ShadowOffset, Ptr); - ShadowEnd = - MemToShadow_PVC(DeviceInfo->ShadowOffset, Ptr + Size - 1); - } else if (DeviceInfo->Type == DeviceType::GPU_DG2) { - ShadowBegin = MemToShadow_DG2(DeviceInfo->ShadowOffset, Ptr); - ShadowEnd = - MemToShadow_DG2(DeviceInfo->ShadowOffset, Ptr + Size - 1); - } else { - getContext()->logger.error("Unsupport device type"); - return UR_RESULT_ERROR_INVALID_ARGUMENT; - } - - assert(ShadowBegin <= ShadowEnd); - { - static const size_t PageSize = - GetVirtualMemGranularity(Context, DeviceInfo->Handle); - - ur_physical_mem_properties_t Desc{ - UR_STRUCTURE_TYPE_PHYSICAL_MEM_PROPERTIES, nullptr, 0}; - static ur_physical_mem_handle_t PhysicalMem{}; - - // Make sure [Ptr, Ptr + Size] is mapped to physical memory - for (auto MappedPtr = RoundDownTo(ShadowBegin, PageSize); - MappedPtr <= ShadowEnd; MappedPtr += PageSize) { - if (!PhysicalMem) { - auto URes = getContext()->urDdiTable.PhysicalMem.pfnCreate( - Context, DeviceInfo->Handle, PageSize, &Desc, - &PhysicalMem); - if (URes != UR_RESULT_SUCCESS) { - getContext()->logger.error("urPhysicalMemCreate(): {}", - URes); - return URes; - } - } - - getContext()->logger.debug("urVirtualMemMap: {} ~ {}", - (void *)MappedPtr, - (void *)(MappedPtr + PageSize - 1)); - - // FIXME: No flag to check the failed reason is VA is already mapped - auto URes = getContext()->urDdiTable.VirtualMem.pfnMap( - Context, 
(void *)MappedPtr, PageSize, PhysicalMem, 0, - UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE); - if (URes != UR_RESULT_SUCCESS) { - getContext()->logger.debug("urVirtualMemMap({}, {}): {}", - (void *)MappedPtr, PageSize, - URes); - } - - // Initialize to zero - if (URes == UR_RESULT_SUCCESS) { - // Reset PhysicalMem to null since it's been mapped - PhysicalMem = nullptr; - - auto URes = - urEnqueueUSMSet(Queue, (void *)MappedPtr, 0, PageSize); - if (URes != UR_RESULT_SUCCESS) { - getContext()->logger.error("urEnqueueUSMFill(): {}", - URes); - return URes; - } - } - } - } - - auto URes = urEnqueueUSMSet(Queue, (void *)ShadowBegin, Value, - ShadowEnd - ShadowBegin + 1); - getContext()->logger.debug( - "enqueueMemSetShadow (addr={}, count={}, value={}): {}", - (void *)ShadowBegin, ShadowEnd - ShadowBegin + 1, - (void *)(size_t)Value, URes); - if (URes != UR_RESULT_SUCCESS) { - getContext()->logger.error("urEnqueueUSMFill(): {}", URes); - return URes; - } +SanitizerInterceptor::~SanitizerInterceptor() { + // We must release these objects before releasing adapters, since + // they may use the adapter in their destructor + for (const auto &[_, DeviceInfo] : m_DeviceMap) { + DeviceInfo->Shadow->Destory(); } - return UR_RESULT_SUCCESS; -} -} // namespace + m_Quarantine = nullptr; + m_MemBufferMap.clear(); + m_AllocationMap.clear(); + m_KernelMap.clear(); + m_ContextMap.clear(); -SanitizerInterceptor::SanitizerInterceptor(logger::Logger &logger) - : logger(logger) { - if (Options(logger).MaxQuarantineSizeMB) { - m_Quarantine = std::make_unique( - static_cast(Options(logger).MaxQuarantineSizeMB) * 1024 * - 1024); + for (auto Adapter : m_Adapters) { + getContext()->urDdiTable.Global.pfnAdapterRelease(Adapter); } } -SanitizerInterceptor::~SanitizerInterceptor() { - DestroyShadowMemoryOnCPU(); - DestroyShadowMemoryOnPVC(); - DestroyShadowMemoryOnDG2(); -} - /// The memory chunk allocated from the underlying allocator looks like this: /// L L L L L L U U U U U U R R /// L -- left 
redzone words (0 or more bytes) @@ -218,8 +79,8 @@ ur_result_t SanitizerInterceptor::allocateMemory( Alignment = MinAlignment; } - uptr RZLog = ComputeRZLog(Size, Options(logger).MinRZSize, - Options(logger).MaxRZSize); + uptr RZLog = + ComputeRZLog(Size, getOptions().MinRZSize, getOptions().MaxRZSize); uptr RZSize = RZLog2Size(RZLog); uptr RoundedSize = RoundUpTo(Size, Alignment); uptr NeededSize = RoundedSize + RZSize * 2; @@ -246,6 +107,9 @@ ur_result_t SanitizerInterceptor::allocateMemory( return UR_RESULT_ERROR_INVALID_ARGUMENT; } + // Udpate statistics + ContextInfo->Stats.UpdateUSMMalloced(NeededSize, NeededSize - Size); + uptr AllocBegin = reinterpret_cast(Allocated); [[maybe_unused]] uptr AllocEnd = AllocBegin + NeededSize; uptr UserBegin = AllocBegin + RZSize; @@ -300,7 +164,8 @@ ur_result_t SanitizerInterceptor::releaseMemory(ur_context_handle_t Context, } auto AllocInfoIt = *AllocInfoItOp; - auto &AllocInfo = AllocInfoIt->second; + // NOTE: AllocInfoIt will be erased later, so "AllocInfo" must be a new reference here + auto AllocInfo = AllocInfoIt->second; if (AllocInfo->Context != Context) { if (AllocInfo->UserBegin == Addr) { @@ -334,23 +199,44 @@ ur_result_t SanitizerInterceptor::releaseMemory(ur_context_handle_t Context, // If quarantine is disabled, USM is freed immediately if (!m_Quarantine) { getContext()->logger.debug("Free: {}", (void *)AllocInfo->AllocBegin); + + ContextInfo->Stats.UpdateUSMRealFreed(AllocInfo->AllocSize, + AllocInfo->getRedzoneSize()); + std::scoped_lock Guard(m_AllocationMapMutex); m_AllocationMap.erase(AllocInfoIt); + return getContext()->urDdiTable.USM.pfnFree( Context, (void *)(AllocInfo->AllocBegin)); } + // If quarantine is enabled, cache it auto ReleaseList = m_Quarantine->put(AllocInfo->Device, AllocInfoIt); if (ReleaseList.size()) { std::scoped_lock Guard(m_AllocationMapMutex); for (auto &It : ReleaseList) { getContext()->logger.info("Quarantine Free: {}", (void *)It->second->AllocBegin); + + 
ContextInfo->Stats.UpdateUSMRealFreed(AllocInfo->AllocSize, + AllocInfo->getRedzoneSize()); + m_AllocationMap.erase(It); + if (AllocInfo->Type == AllocType::HOST_USM) { + for (auto &Device : ContextInfo->DeviceList) { + UR_CALL(getDeviceInfo(Device)->Shadow->ReleaseShadow( + AllocInfo)); + } + } else { + UR_CALL(getDeviceInfo(AllocInfo->Device) + ->Shadow->ReleaseShadow(AllocInfo)); + } + UR_CALL(getContext()->urDdiTable.USM.pfnFree( Context, (void *)(It->second->AllocBegin))); } } + ContextInfo->Stats.UpdateUSMFreed(AllocInfo->AllocSize); return UR_RESULT_SUCCESS; } @@ -372,8 +258,8 @@ ur_result_t SanitizerInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, return UR_RESULT_ERROR_INVALID_QUEUE; } - UR_CALL( - prepareLaunch(Context, DeviceInfo, InternalQueue, Kernel, LaunchInfo)); + UR_CALL(prepareLaunch(ContextInfo, DeviceInfo, InternalQueue, Kernel, + LaunchInfo)); UR_CALL(updateShadowMemory(ContextInfo, DeviceInfo, InternalQueue)); @@ -413,18 +299,12 @@ ur_result_t SanitizerInterceptor::postLaunchKernel(ur_kernel_handle_t Kernel, } ur_result_t DeviceInfo::allocShadowMemory(ur_context_handle_t Context) { - if (Type == DeviceType::CPU) { - UR_CALL(SetupShadowMemoryOnCPU(ShadowOffset, ShadowOffsetEnd)); - } else if (Type == DeviceType::GPU_PVC) { - UR_CALL(SetupShadowMemoryOnPVC(Context, ShadowOffset, ShadowOffsetEnd)); - } else if (Type == DeviceType::GPU_DG2) { - UR_CALL(SetupShadowMemoryOnDG2(Context, ShadowOffset, ShadowOffsetEnd)); - } else { - getContext()->logger.error("Unsupport device type"); - return UR_RESULT_ERROR_INVALID_ARGUMENT; - } + Shadow = GetShadowMemory(Context, Handle, Type); + assert(Shadow && "Failed to get shadow memory"); + UR_CALL(Shadow->Setup()); getContext()->logger.info("ShadowMemory(Global): {} - {}", - (void *)ShadowOffset, (void *)ShadowOffsetEnd); + (void *)Shadow->ShadowBegin, + (void *)Shadow->ShadowEnd); return UR_RESULT_SUCCESS; } @@ -435,9 +315,10 @@ ur_result_t DeviceInfo::allocShadowMemory(ur_context_handle_t Context) { 
/// - 1 <= k <= 7: Only the first k bytes is accessible /// /// ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#mapping -ur_result_t SanitizerInterceptor::enqueueAllocInfo( - ur_context_handle_t Context, std::shared_ptr &DeviceInfo, - ur_queue_handle_t Queue, std::shared_ptr &AI) { +ur_result_t +SanitizerInterceptor::enqueueAllocInfo(std::shared_ptr &DeviceInfo, + ur_queue_handle_t Queue, + std::shared_ptr &AI) { if (AI->IsReleased) { int ShadowByte; switch (AI->Type) { @@ -457,14 +338,14 @@ ur_result_t SanitizerInterceptor::enqueueAllocInfo( ShadowByte = 0xff; assert(false && "Unknow AllocInfo Type"); } - UR_CALL(enqueueMemSetShadow(Context, DeviceInfo, Queue, AI->AllocBegin, - AI->AllocSize, ShadowByte)); + UR_CALL(DeviceInfo->Shadow->EnqueuePoisonShadow( + Queue, AI->AllocBegin, AI->AllocSize, ShadowByte)); return UR_RESULT_SUCCESS; } // Init zero - UR_CALL(enqueueMemSetShadow(Context, DeviceInfo, Queue, AI->AllocBegin, - AI->AllocSize, 0)); + UR_CALL(DeviceInfo->Shadow->EnqueuePoisonShadow(Queue, AI->AllocBegin, + AI->AllocSize, 0)); uptr TailBegin = RoundUpTo(AI->UserEnd, ASAN_SHADOW_GRANULARITY); uptr TailEnd = AI->AllocBegin + AI->AllocSize; @@ -473,8 +354,8 @@ ur_result_t SanitizerInterceptor::enqueueAllocInfo( if (TailBegin != AI->UserEnd) { auto Value = AI->UserEnd - RoundDownTo(AI->UserEnd, ASAN_SHADOW_GRANULARITY); - UR_CALL(enqueueMemSetShadow(Context, DeviceInfo, Queue, AI->UserEnd, 1, - static_cast(Value))); + UR_CALL(DeviceInfo->Shadow->EnqueuePoisonShadow( + Queue, AI->UserEnd, 1, static_cast(Value))); } int ShadowByte; @@ -500,12 +381,12 @@ ur_result_t SanitizerInterceptor::enqueueAllocInfo( } // Left red zone - UR_CALL(enqueueMemSetShadow(Context, DeviceInfo, Queue, AI->AllocBegin, - AI->UserBegin - AI->AllocBegin, ShadowByte)); + UR_CALL(DeviceInfo->Shadow->EnqueuePoisonShadow( + Queue, AI->AllocBegin, AI->UserBegin - AI->AllocBegin, ShadowByte)); // Right red zone - UR_CALL(enqueueMemSetShadow(Context, DeviceInfo, 
Queue, TailBegin, - TailEnd - TailBegin, ShadowByte)); + UR_CALL(DeviceInfo->Shadow->EnqueuePoisonShadow( + Queue, TailBegin, TailEnd - TailBegin, ShadowByte)); return UR_RESULT_SUCCESS; } @@ -517,22 +398,58 @@ ur_result_t SanitizerInterceptor::updateShadowMemory( std::scoped_lock Guard(AllocInfos.Mutex); for (auto &AI : AllocInfos.List) { - UR_CALL(enqueueAllocInfo(ContextInfo->Handle, DeviceInfo, Queue, AI)); + UR_CALL(enqueueAllocInfo(DeviceInfo, Queue, AI)); } AllocInfos.List.clear(); return UR_RESULT_SUCCESS; } -ur_result_t -SanitizerInterceptor::registerDeviceGlobals(ur_context_handle_t Context, - ur_program_handle_t Program) { - std::vector Devices = GetProgramDevices(Program); +ur_result_t SanitizerInterceptor::registerProgram(ur_context_handle_t Context, + ur_program_handle_t Program) { + std::vector Devices = GetDevices(Program); auto ContextInfo = getContextInfo(Context); + auto ProgramInfo = getProgramInfo(Program); for (auto Device : Devices) { ManagedQueue Queue(Context, Device); + auto DeviceInfo = getDeviceInfo(Device); + + // Write global variable to program + auto EnqueueWriteGlobal = [&Queue, &Program]( + const char *Name, const void *Value, + size_t Size, bool ReportWarning = true) { + auto Result = + getContext()->urDdiTable.Enqueue.pfnDeviceGlobalVariableWrite( + Queue, Program, Name, false, Size, 0, Value, 0, nullptr, + nullptr); + if (ReportWarning && Result != UR_RESULT_SUCCESS) { + getContext()->logger.warning( + "Failed to write device global \"{}\": {}", Name, Result); + return false; + } + return true; + }; + + // Write debug + // We use "uint64_t" here because EnqueueWriteGlobal will fail when it's "uint32_t" + // Because EnqueueWriteGlobal is a async write, so + // we need to extend its lifetime + static uint64_t Debug = getOptions().Debug ? 
1 : 0; + EnqueueWriteGlobal(kSPIR_AsanDebug, &Debug, sizeof(Debug), false); + + // Write shadow memory offset for global memory + EnqueueWriteGlobal(kSPIR_AsanShadowMemoryGlobalStart, + &DeviceInfo->Shadow->ShadowBegin, + sizeof(DeviceInfo->Shadow->ShadowBegin)); + EnqueueWriteGlobal(kSPIR_AsanShadowMemoryGlobalEnd, + &DeviceInfo->Shadow->ShadowEnd, + sizeof(DeviceInfo->Shadow->ShadowEnd)); + + // Write device type + EnqueueWriteGlobal(kSPIR_DeviceType, &DeviceInfo->Type, + sizeof(DeviceInfo->Type)); uint64_t NumOfDeviceGlobal; auto Result = @@ -556,7 +473,6 @@ SanitizerInterceptor::registerDeviceGlobals(ur_context_handle_t Context, return Result; } - auto DeviceInfo = getDeviceInfo(Device); for (size_t i = 0; i < NumOfDeviceGlobal; i++) { auto AI = std::make_shared( AllocInfo{GVInfos[i].Addr, @@ -571,12 +487,34 @@ SanitizerInterceptor::registerDeviceGlobals(ur_context_handle_t Context, {}}); ContextInfo->insertAllocInfo({Device}, AI); + + { + std::scoped_lock Guard( + m_AllocationMapMutex, ProgramInfo->Mutex); + ProgramInfo->AllocInfoForGlobals.emplace(AI); + m_AllocationMap.emplace(AI->AllocBegin, std::move(AI)); + } } } return UR_RESULT_SUCCESS; } +ur_result_t +SanitizerInterceptor::unregisterProgram(ur_program_handle_t Program) { + auto ProgramInfo = getProgramInfo(Program); + + std::scoped_lock Guard( + m_AllocationMapMutex, ProgramInfo->Mutex); + for (auto AI : ProgramInfo->AllocInfoForGlobals) { + UR_CALL(getDeviceInfo(AI->Device)->Shadow->ReleaseShadow(AI)); + m_AllocationMap.erase(AI->AllocBegin); + } + ProgramInfo->AllocInfoForGlobals.clear(); + + return UR_RESULT_SUCCESS; +} + ur_result_t SanitizerInterceptor::insertContext(ur_context_handle_t Context, std::shared_ptr &CI) { @@ -615,6 +553,9 @@ SanitizerInterceptor::insertDevice(ur_device_handle_t Device, DI = std::make_shared(Device); + DI->IsSupportSharedSystemUSM = GetDeviceUSMCapability( + Device, UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT); + // Query alignment 
UR_CALL(getContext()->urDdiTable.Device.pfnGetInfo( Device, UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN, sizeof(DI->Alignment), @@ -634,6 +575,22 @@ ur_result_t SanitizerInterceptor::eraseDevice(ur_device_handle_t Device) { return UR_RESULT_SUCCESS; } +ur_result_t SanitizerInterceptor::insertProgram(ur_program_handle_t Program) { + std::scoped_lock Guard(m_ProgramMapMutex); + if (m_ProgramMap.find(Program) != m_ProgramMap.end()) { + return UR_RESULT_SUCCESS; + } + m_ProgramMap.emplace(Program, std::make_shared(Program)); + return UR_RESULT_SUCCESS; +} + +ur_result_t SanitizerInterceptor::eraseProgram(ur_program_handle_t Program) { + std::scoped_lock Guard(m_ProgramMapMutex); + assert(m_ProgramMap.find(Program) != m_ProgramMap.end()); + m_ProgramMap.erase(Program); + return UR_RESULT_SUCCESS; +} + ur_result_t SanitizerInterceptor::insertKernel(ur_kernel_handle_t Kernel) { std::scoped_lock Guard(m_KernelMapMutex); if (m_KernelMap.find(Kernel) != m_KernelMap.end()) { @@ -677,14 +634,30 @@ SanitizerInterceptor::getMemBuffer(ur_mem_handle_t MemHandle) { } ur_result_t SanitizerInterceptor::prepareLaunch( - ur_context_handle_t Context, std::shared_ptr &DeviceInfo, - ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - USMLaunchInfo &LaunchInfo) { - auto Program = GetProgram(Kernel); + std::shared_ptr &ContextInfo, + std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, + ur_kernel_handle_t Kernel, USMLaunchInfo &LaunchInfo) { do { - // Set membuffer arguments auto KernelInfo = getKernelInfo(Kernel); + + // Validate pointer arguments + if (getOptions().DetectKernelArguments) { + for (const auto &[ArgIndex, PtrPair] : KernelInfo->PointerArgs) { + auto Ptr = PtrPair.first; + if (Ptr == nullptr) { + continue; + } + if (auto ValidateResult = ValidateUSMPointer( + ContextInfo->Handle, DeviceInfo->Handle, (uptr)Ptr)) { + ReportInvalidKernelArgument(Kernel, ArgIndex, (uptr)Ptr, + ValidateResult, PtrPair.second); + exit(1); + } + } + } + + // Set membuffer arguments for (const auto 
&[ArgIndex, MemBuffer] : KernelInfo->BufferArgs) { char *ArgPointer = nullptr; UR_CALL(MemBuffer->getHandle(DeviceInfo->Handle, ArgPointer)); @@ -714,41 +687,6 @@ ur_result_t SanitizerInterceptor::prepareLaunch( } } - // Write global variable to program - auto EnqueueWriteGlobal = [Queue, Program](const char *Name, - const void *Value, - size_t Size) { - auto Result = - getContext()->urDdiTable.Enqueue.pfnDeviceGlobalVariableWrite( - Queue, Program, Name, false, Size, 0, Value, 0, nullptr, - nullptr); - if (Result != UR_RESULT_SUCCESS) { - getContext()->logger.warning( - "Failed to write device global \"{}\": {}", Name, Result); - return false; - } - return true; - }; - - // Write debug - // We use "uint64_t" here because EnqueueWriteGlobal will fail when it's "uint32_t" - // Because EnqueueWriteGlobal is a async write, so - // we need to extend its lifetime - static uint64_t Debug = Options(logger).Debug ? 1 : 0; - EnqueueWriteGlobal(kSPIR_AsanDebug, &Debug, sizeof(Debug)); - - // Write shadow memory offset for global memory - EnqueueWriteGlobal(kSPIR_AsanShadowMemoryGlobalStart, - &DeviceInfo->ShadowOffset, - sizeof(DeviceInfo->ShadowOffset)); - EnqueueWriteGlobal(kSPIR_AsanShadowMemoryGlobalEnd, - &DeviceInfo->ShadowOffsetEnd, - sizeof(DeviceInfo->ShadowOffsetEnd)); - - // Write device type - EnqueueWriteGlobal(kSPIR_DeviceType, &DeviceInfo->Type, - sizeof(DeviceInfo->Type)); - if (LaunchInfo.LocalWorkSize.empty()) { LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim); auto URes = @@ -775,17 +713,17 @@ ur_result_t SanitizerInterceptor::prepareLaunch( LocalWorkSize[Dim]; } - auto EnqueueAllocateShadowMemory = [Context, &DeviceInfo, + auto EnqueueAllocateShadowMemory = [Context = ContextInfo->Handle, + Device = DeviceInfo->Handle, Queue](size_t Size, uptr &Ptr) { void *Allocated = nullptr; auto URes = getContext()->urDdiTable.USM.pfnDeviceAlloc( - Context, DeviceInfo->Handle, nullptr, nullptr, Size, - &Allocated); + Context, Device, nullptr, nullptr, Size, 
&Allocated); if (URes != UR_RESULT_SUCCESS) { return URes; } // Initialize shadow memory - URes = urEnqueueUSMSet(Queue, Allocated, 0, Size); + URes = EnqueueUSMBlockingSet(Queue, Allocated, 0, Size); if (URes != UR_RESULT_SUCCESS) { [[maybe_unused]] auto URes = getContext()->urDdiTable.USM.pfnFree(Context, Allocated); @@ -807,7 +745,7 @@ ur_result_t SanitizerInterceptor::prepareLaunch( LocalMemoryUsage, PrivateMemoryUsage); // Write shadow memory offset for local memory - if (Options(logger).DetectLocals) { + if (getOptions().DetectLocals) { // CPU needn't this if (DeviceInfo->Type == DeviceType::GPU_PVC || DeviceInfo->Type == DeviceType::GPU_DG2) { @@ -838,6 +776,9 @@ ur_result_t SanitizerInterceptor::prepareLaunch( LaunchInfo.Data->LocalShadowOffset + LocalShadowMemorySize - 1; + ContextInfo->Stats.UpdateShadowMalloced( + LocalShadowMemorySize); + getContext()->logger.info( "ShadowMemory(Local, {} - {})", (void *)LaunchInfo.Data->LocalShadowOffset, @@ -847,9 +788,10 @@ ur_result_t SanitizerInterceptor::prepareLaunch( } // Write shadow memory offset for private memory - if (Options(logger).DetectPrivates) { + if (getOptions().DetectPrivates) { if (DeviceInfo->Type == DeviceType::CPU) { - LaunchInfo.Data->PrivateShadowOffset = DeviceInfo->ShadowOffset; + LaunchInfo.Data->PrivateShadowOffset = + DeviceInfo->Shadow->ShadowBegin; } else if (DeviceInfo->Type == DeviceType::GPU_PVC || DeviceInfo->Type == DeviceType::GPU_DG2) { const size_t PrivateShadowMemorySize = @@ -875,6 +817,10 @@ ur_result_t SanitizerInterceptor::prepareLaunch( LaunchInfo.Data->PrivateShadowOffsetEnd = LaunchInfo.Data->PrivateShadowOffset + PrivateShadowMemorySize - 1; + + ContextInfo->Stats.UpdateShadowMalloced( + PrivateShadowMemorySize); + getContext()->logger.info( "ShadowMemory(Private, {} - {})", (void *)LaunchInfo.Data->PrivateShadowOffset, @@ -902,6 +848,37 @@ SanitizerInterceptor::findAllocInfoByAddress(uptr Address) { return It; } +std::vector 
+SanitizerInterceptor::findAllocInfoByContext(ur_context_handle_t Context) { + std::shared_lock Guard(m_AllocationMapMutex); + std::vector AllocInfos; + for (auto It = m_AllocationMap.begin(); It != m_AllocationMap.end(); It++) { + const auto &[_, AI] = *It; + if (AI->Context == Context) { + AllocInfos.emplace_back(It); + } + } + return AllocInfos; +} + +ContextInfo::~ContextInfo() { + Stats.Print(Handle); + + [[maybe_unused]] auto Result = + getContext()->urDdiTable.Context.pfnRelease(Handle); + assert(Result == UR_RESULT_SUCCESS); + + // check memory leaks + std::vector AllocInfos = + getContext()->interceptor->findAllocInfoByContext(Handle); + for (const auto &It : AllocInfos) { + const auto &[_, AI] = *It; + if (!AI->IsReleased) { + ReportMemoryLeak(AI); + } + } +} + ur_result_t USMLaunchInfo::initialize() { UR_CALL(getContext()->urDdiTable.Context.pfnRetain(Context)); UR_CALL(getContext()->urDdiTable.Device.pfnRetain(Device)); @@ -933,13 +910,19 @@ USMLaunchInfo::~USMLaunchInfo() { [[maybe_unused]] ur_result_t Result; if (Data) { auto Type = GetDeviceType(Context, Device); + auto ContextInfo = getContext()->interceptor->getContextInfo(Context); if (Type == DeviceType::GPU_PVC || Type == DeviceType::GPU_DG2) { if (Data->PrivateShadowOffset) { + ContextInfo->Stats.UpdateShadowFreed( + Data->PrivateShadowOffsetEnd - Data->PrivateShadowOffset + + 1); Result = getContext()->urDdiTable.USM.pfnFree( Context, (void *)Data->PrivateShadowOffset); assert(Result == UR_RESULT_SUCCESS); } if (Data->LocalShadowOffset) { + ContextInfo->Stats.UpdateShadowFreed( + Data->LocalShadowOffsetEnd - Data->LocalShadowOffset + 1); Result = getContext()->urDdiTable.USM.pfnFree( Context, (void *)Data->LocalShadowOffset); assert(Result == UR_RESULT_SUCCESS); diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp index 39c7705c99..e5429acd56 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.hpp +++ 
b/source/loader/layers/sanitizer/asan_interceptor.hpp @@ -15,6 +15,9 @@ #include "asan_allocator.hpp" #include "asan_buffer.hpp" #include "asan_libdevice.hpp" +#include "asan_options.hpp" +#include "asan_shadow.hpp" +#include "asan_statistics.hpp" #include "common.hpp" #include "ur_sanitizer_layer.hpp" @@ -22,6 +25,7 @@ #include #include #include +#include #include namespace ur_sanitizer_layer { @@ -38,24 +42,19 @@ struct DeviceInfo { DeviceType Type = DeviceType::UNKNOWN; size_t Alignment = 0; - uptr ShadowOffset = 0; - uptr ShadowOffsetEnd = 0; + std::shared_ptr Shadow; + // Device features + bool IsSupportSharedSystemUSM = false; + + // lock this mutex if following fields are accessed ur_mutex Mutex; std::queue> Quarantine; size_t QuarantineSize = 0; - explicit DeviceInfo(ur_device_handle_t Device) : Handle(Device) { - [[maybe_unused]] auto Result = - getContext()->urDdiTable.Device.pfnRetain(Device); - assert(Result == UR_RESULT_SUCCESS); - } - - ~DeviceInfo() { - [[maybe_unused]] auto Result = - getContext()->urDdiTable.Device.pfnRelease(Handle); - assert(Result == UR_RESULT_SUCCESS); - } + // Device handles are special and alive in the whole process lifetime, + // so we needn't retain&release here. 
+ explicit DeviceInfo(ur_device_handle_t Device) : Handle(Device) {} ur_result_t allocShadowMemory(ur_context_handle_t Context); }; @@ -63,6 +62,7 @@ struct DeviceInfo { struct QueueInfo { ur_queue_handle_t Handle; + // lock this mutex if following fields are accessed ur_shared_mutex Mutex; ur_event_handle_t LastEvent; @@ -82,9 +82,13 @@ struct QueueInfo { struct KernelInfo { ur_kernel_handle_t Handle; - ur_shared_mutex Mutex; std::atomic RefCount = 1; + + // lock this mutex if following fields are accessed + ur_shared_mutex Mutex; std::unordered_map> BufferArgs; + std::unordered_map> + PointerArgs; // Need preserve the order of local arguments std::map LocalArgs; @@ -102,23 +106,43 @@ struct KernelInfo { } }; +struct ProgramInfo { + ur_program_handle_t Handle; + std::atomic RefCount = 1; + + // lock this mutex if following fields are accessed + ur_shared_mutex Mutex; + std::unordered_set> AllocInfoForGlobals; + + explicit ProgramInfo(ur_program_handle_t Program) : Handle(Program) { + [[maybe_unused]] auto Result = + getContext()->urDdiTable.Program.pfnRetain(Handle); + assert(Result == UR_RESULT_SUCCESS); + } + + ~ProgramInfo() { + [[maybe_unused]] auto Result = + getContext()->urDdiTable.Program.pfnRelease(Handle); + assert(Result == UR_RESULT_SUCCESS); + } +}; + struct ContextInfo { ur_context_handle_t Handle; + std::atomic RefCount = 1; std::vector DeviceList; std::unordered_map AllocInfosMap; + AsanStatsWrapper Stats; + explicit ContextInfo(ur_context_handle_t Context) : Handle(Context) { [[maybe_unused]] auto Result = getContext()->urDdiTable.Context.pfnRetain(Context); assert(Result == UR_RESULT_SUCCESS); } - ~ContextInfo() { - [[maybe_unused]] auto Result = - getContext()->urDdiTable.Context.pfnRelease(Handle); - assert(Result == UR_RESULT_SUCCESS); - } + ~ContextInfo(); void insertAllocInfo(const std::vector &Devices, std::shared_ptr &AI) { @@ -164,7 +188,7 @@ struct DeviceGlobalInfo { class SanitizerInterceptor { public: - explicit 
SanitizerInterceptor(logger::Logger &logger); + explicit SanitizerInterceptor(); ~SanitizerInterceptor(); @@ -175,8 +199,10 @@ class SanitizerInterceptor { AllocType Type, void **ResultPtr); ur_result_t releaseMemory(ur_context_handle_t Context, void *Ptr); - ur_result_t registerDeviceGlobals(ur_context_handle_t Context, - ur_program_handle_t Program); + ur_result_t registerProgram(ur_context_handle_t Context, + ur_program_handle_t Program); + + ur_result_t unregisterProgram(ur_program_handle_t Program); ur_result_t preLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, @@ -194,6 +220,9 @@ class SanitizerInterceptor { std::shared_ptr &CI); ur_result_t eraseDevice(ur_device_handle_t Device); + ur_result_t insertProgram(ur_program_handle_t Program); + ur_result_t eraseProgram(ur_program_handle_t Program); + ur_result_t insertKernel(ur_kernel_handle_t Kernel); ur_result_t eraseKernel(ur_kernel_handle_t Kernel); @@ -201,8 +230,21 @@ class SanitizerInterceptor { ur_result_t eraseMemBuffer(ur_mem_handle_t MemHandle); std::shared_ptr getMemBuffer(ur_mem_handle_t MemHandle); + ur_result_t holdAdapter(ur_adapter_handle_t Adapter) { + std::scoped_lock Guard(m_AdaptersMutex); + if (m_Adapters.find(Adapter) != m_Adapters.end()) { + return UR_RESULT_SUCCESS; + } + UR_CALL(getContext()->urDdiTable.Global.pfnAdapterRetain(Adapter)); + m_Adapters.insert(Adapter); + return UR_RESULT_SUCCESS; + } + std::optional findAllocInfoByAddress(uptr Address); + std::vector + findAllocInfoByContext(ur_context_handle_t Context); + std::shared_ptr getContextInfo(ur_context_handle_t Context) { std::shared_lock Guard(m_ContextMapMutex); assert(m_ContextMap.find(Context) != m_ContextMap.end()); @@ -215,23 +257,31 @@ class SanitizerInterceptor { return m_DeviceMap[Device]; } + std::shared_ptr getProgramInfo(ur_program_handle_t Program) { + std::shared_lock Guard(m_ProgramMapMutex); + assert(m_ProgramMap.find(Program) != m_ProgramMap.end()); + return m_ProgramMap[Program]; + } + 
std::shared_ptr getKernelInfo(ur_kernel_handle_t Kernel) { std::shared_lock Guard(m_KernelMapMutex); assert(m_KernelMap.find(Kernel) != m_KernelMap.end()); return m_KernelMap[Kernel]; } + const AsanOptions &getOptions() { return m_Options; } + private: ur_result_t updateShadowMemory(std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue); - ur_result_t enqueueAllocInfo(ur_context_handle_t Context, - std::shared_ptr &DeviceInfo, + + ur_result_t enqueueAllocInfo(std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, std::shared_ptr &AI); /// Initialize Global Variables & Kernel Name at first Launch - ur_result_t prepareLaunch(ur_context_handle_t Context, + ur_result_t prepareLaunch(std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, @@ -248,6 +298,10 @@ class SanitizerInterceptor { m_DeviceMap; ur_shared_mutex m_DeviceMapMutex; + std::unordered_map> + m_ProgramMap; + ur_shared_mutex m_ProgramMapMutex; + std::unordered_map> m_KernelMap; ur_shared_mutex m_KernelMapMutex; @@ -261,7 +315,11 @@ class SanitizerInterceptor { ur_shared_mutex m_AllocationMapMutex; std::unique_ptr m_Quarantine; - logger::Logger &logger; + + AsanOptions m_Options; + + std::unordered_set m_Adapters; + ur_shared_mutex m_AdaptersMutex; }; } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_options.cpp b/source/loader/layers/sanitizer/asan_options.cpp new file mode 100644 index 0000000000..5c42ab8fca --- /dev/null +++ b/source/loader/layers/sanitizer/asan_options.cpp @@ -0,0 +1,142 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan_options.cpp + * + */ + +#include "asan_options.hpp" +#include "ur_sanitizer_layer.hpp" + +#include +#include +#include + +namespace ur_sanitizer_layer { + +AsanOptions::AsanOptions() { + std::optional OptionsEnvMap; + try { + OptionsEnvMap = getenv_to_map("UR_LAYER_ASAN_OPTIONS"); + } catch (const std::invalid_argument &e) { + std::stringstream SS; + SS << "[ERROR]: "; + SS << e.what(); + getContext()->logger.always(SS.str().c_str()); + die("Sanitizer failed to parse options.\n"); + } + + if (!OptionsEnvMap.has_value()) { + return; + } + + const char *TrueStrings[] = {"1", "true"}; + const char *FalseStrings[] = {"0", "false"}; + + auto InplaceToLower = [](std::string &S) { + std::transform(S.begin(), S.end(), S.begin(), + [](unsigned char C) { return std::tolower(C); }); + }; + auto IsTrue = [&](const std::string &S) { + return std::any_of(std::begin(TrueStrings), std::end(TrueStrings), + [&](const char *CS) { return S == CS; }); + }; + auto IsFalse = [&](const std::string &S) { + return std::any_of(std::begin(FalseStrings), std::end(FalseStrings), + [&](const char *CS) { return S == CS; }); + }; + + auto SetBoolOption = [&](const std::string &Name, bool &Opt) { + auto KV = OptionsEnvMap->find(Name); + if (KV != OptionsEnvMap->end()) { + auto Value = KV->second.front(); + InplaceToLower(Value); + if (IsTrue(Value)) { + Opt = true; + } else if (IsFalse(Value)) { + Opt = false; + } else { + std::stringstream SS; + SS << "\"" << Name << "\" is set to \"" << Value + << "\", which is not an valid setting. 
"; + SS << "Acceptable input are: for enable, use:"; + for (auto &S : TrueStrings) { + SS << " \"" << S << "\""; + } + SS << "; "; + SS << "for disable, use:"; + for (auto &S : FalseStrings) { + SS << " \"" << S << "\""; + } + SS << "."; + getContext()->logger.error(SS.str().c_str()); + die("Sanitizer failed to parse options.\n"); + } + } + }; + + SetBoolOption("debug", Debug); + SetBoolOption("detect_kernel_arguments", DetectKernelArguments); + SetBoolOption("detect_locals", DetectLocals); + SetBoolOption("detect_privates", DetectPrivates); + SetBoolOption("print_stats", PrintStats); + + auto KV = OptionsEnvMap->find("quarantine_size_mb"); + if (KV != OptionsEnvMap->end()) { + const auto &Value = KV->second.front(); + try { + auto temp_long = std::stoul(Value); + if (temp_long > UINT32_MAX) { + throw std::out_of_range(""); + } + MaxQuarantineSizeMB = temp_long; + } catch (...) { + getContext()->logger.error("\"quarantine_size_mb\" should be " + "an integer in range[0, {}].", + UINT32_MAX); + die("Sanitizer failed to parse options.\n"); + } + } + + KV = OptionsEnvMap->find("redzone"); + if (KV != OptionsEnvMap->end()) { + const auto &Value = KV->second.front(); + try { + MinRZSize = std::stoul(Value); + if (MinRZSize < 16) { + MinRZSize = 16; + getContext()->logger.warning("Trying to set redzone size to a " + "value less than 16 is ignored."); + } + } catch (...) { + getContext()->logger.error( + "\"redzone\" should be an integer in range[0, 16]."); + die("Sanitizer failed to parse options.\n"); + } + } + + KV = OptionsEnvMap->find("max_redzone"); + if (KV != OptionsEnvMap->end()) { + const auto &Value = KV->second.front(); + try { + MaxRZSize = std::stoul(Value); + if (MaxRZSize > 2048) { + MaxRZSize = 2048; + getContext()->logger.warning( + "Trying to set max redzone size to a " + "value greater than 2048 is ignored."); + } + } catch (...) 
{ + getContext()->logger.error( + "\"max_redzone\" should be an integer in range[0, 2048]."); + die("Sanitizer failed to parse options.\n"); + } + } +} + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_options.hpp b/source/loader/layers/sanitizer/asan_options.hpp index ab6ee0c26b..4c515e28fe 100644 --- a/source/loader/layers/sanitizer/asan_options.hpp +++ b/source/loader/layers/sanitizer/asan_options.hpp @@ -12,138 +12,21 @@ #pragma once -#include "common/ur_util.hpp" -#include "ur/ur.hpp" -#include "ur_sanitizer_layer.hpp" - -#include -#include -#include +#include "common.hpp" namespace ur_sanitizer_layer { struct AsanOptions { - public: - AsanOptions(AsanOptions &other) = delete; - void operator=(const AsanOptions &) = delete; - - static AsanOptions &getInstance(logger::Logger &logger) { - static AsanOptions instance(logger); - return instance; - } - bool Debug = false; uint64_t MinRZSize = 16; uint64_t MaxRZSize = 2048; - uint32_t MaxQuarantineSizeMB = 0; + uint32_t MaxQuarantineSizeMB = 8; bool DetectLocals = true; bool DetectPrivates = true; + bool PrintStats = false; + bool DetectKernelArguments = true; - private: - AsanOptions(logger::Logger &logger) { - auto OptionsEnvMap = getenv_to_map("UR_LAYER_ASAN_OPTIONS"); - if (!OptionsEnvMap.has_value()) { - return; - } - - const char *TrueStrings[] = {"1", "true"}; - const char *FalseStrings[] = {"0", "false"}; - - auto InplaceToLower = [](std::string &S) { - std::transform(S.begin(), S.end(), S.begin(), - [](unsigned char C) { return std::tolower(C); }); - }; - auto IsTrue = [&](const std::string &S) { - return std::any_of(std::begin(TrueStrings), std::end(TrueStrings), - [&](const char *CS) { return S == CS; }); - }; - auto IsFalse = [&](const std::string &S) { - return std::any_of(std::begin(FalseStrings), std::end(FalseStrings), - [&](const char *CS) { return S == CS; }); - }; - - auto SetBoolOption = [&](const std::string &Name, bool &Opt) { - auto KV = 
OptionsEnvMap->find(Name); - if (KV != OptionsEnvMap->end()) { - auto Value = KV->second.front(); - InplaceToLower(Value); - if (IsTrue(Value)) { - Opt = true; - } else if (IsFalse(Value)) { - Opt = false; - } else { - std::stringstream SS; - SS << "[ERROR]: \"" << Name << "\" is set to \"" - << Value << "\", which is not an valid setting. "; - SS << "Acceptable input are: for enable, use:"; - for (auto &S : TrueStrings) { - SS << " \"" << S << "\""; - } - SS << "; "; - SS << "for disable, use:"; - for (auto &S : FalseStrings) { - SS << " \"" << S << "\""; - } - SS << "."; - die(SS.str().c_str()); - } - } - }; - - SetBoolOption("debug", Debug); - SetBoolOption("detect_locals", DetectLocals); - SetBoolOption("detect_privates", DetectPrivates); - - auto KV = OptionsEnvMap->find("quarantine_size_mb"); - if (KV != OptionsEnvMap->end()) { - auto Value = KV->second.front(); - try { - auto temp_long = std::stoul(Value); - if (temp_long > UINT32_MAX) { - throw std::out_of_range(""); - } - MaxQuarantineSizeMB = temp_long; - } catch (...) { - die("[ERROR]: \"quarantine_size_mb\" should be " - "an positive integer that smaller than or equal to " - "4294967295."); - } - } - - KV = OptionsEnvMap->find("redzone"); - if (KV != OptionsEnvMap->end()) { - auto Value = KV->second.front(); - try { - MinRZSize = std::stoul(Value); - if (MinRZSize < 16) { - MinRZSize = 16; - logger.warning("Trying to set redzone size to a " - "value less than 16 is ignored"); - } - } catch (...) { - die("[ERROR]: \"redzone\" should be an integer"); - } - } - - KV = OptionsEnvMap->find("max_redzone"); - if (KV != OptionsEnvMap->end()) { - auto Value = KV->second.front(); - try { - MaxRZSize = std::stoul(Value); - if (MaxRZSize > 2048) { - MaxRZSize = 2048; - logger.warning("Trying to set max redzone size to a " - "value greater than 2048 is ignored"); - } - } catch (...) 
{ - die("[ERROR]: \"max_redzone\" should be an integer"); - } - } - } + explicit AsanOptions(); }; -inline const AsanOptions &Options(logger::Logger &logger) { - return AsanOptions::getInstance(logger); -} - } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_report.cpp b/source/loader/layers/sanitizer/asan_report.cpp index bdae3284b4..c1a1230e78 100644 --- a/source/loader/layers/sanitizer/asan_report.cpp +++ b/source/loader/layers/sanitizer/asan_report.cpp @@ -11,16 +11,32 @@ */ #include "asan_report.hpp" -#include "asan_options.hpp" - #include "asan_allocator.hpp" #include "asan_interceptor.hpp" #include "asan_libdevice.hpp" +#include "asan_options.hpp" +#include "asan_validator.hpp" #include "ur_sanitizer_layer.hpp" #include "ur_sanitizer_utils.hpp" namespace ur_sanitizer_layer { +namespace { + +void PrintAllocateInfo(uptr Addr, const AllocInfo *AI) { + getContext()->logger.always("{} is located inside of {} region [{}, {})", + (void *)Addr, ToString(AI->Type), + (void *)AI->UserBegin, (void *)AI->UserEnd); + getContext()->logger.always("allocated here:"); + AI->AllocStack.print(); + if (AI->IsReleased) { + getContext()->logger.always("freed here:"); + AI->ReleaseStack.print(); + } +} + +} // namespace + void ReportBadFree(uptr Addr, const StackTrace &stack, const std::shared_ptr &AI) { getContext()->logger.always( @@ -32,13 +48,9 @@ void ReportBadFree(uptr Addr, const StackTrace &stack, (void *)Addr); } - assert(!AI->IsReleased && "Chunk must be not released"); + assert(AI && !AI->IsReleased && "Chunk must be not released"); - getContext()->logger.always("{} is located inside of {} region [{}, {})", - (void *)Addr, ToString(AI->Type), - (void *)AI->UserBegin, (void *)AI->UserEnd); - getContext()->logger.always("allocated here:"); - AI->AllocStack.print(); + PrintAllocateInfo(Addr, AI.get()); } void ReportBadContext(uptr Addr, const StackTrace &stack, @@ -48,16 +60,7 @@ void ReportBadContext(uptr Addr, const StackTrace &stack, 
(void *)Addr); stack.print(); - getContext()->logger.always("{} is located inside of {} region [{}, {})", - (void *)Addr, ToString(AI->Type), - (void *)AI->UserBegin, (void *)AI->UserEnd); - getContext()->logger.always("allocated here:"); - AI->AllocStack.print(); - - if (AI->IsReleased) { - getContext()->logger.always("freed here:"); - AI->ReleaseStack.print(); - } + PrintAllocateInfo(Addr, AI.get()); } void ReportDoubleFree(uptr Addr, const StackTrace &Stack, @@ -76,6 +79,16 @@ void ReportDoubleFree(uptr Addr, const StackTrace &Stack, AI->AllocStack.print(); } +void ReportMemoryLeak(const std::shared_ptr &AI) { + getContext()->logger.always( + "\n====ERROR: DeviceSanitizer: detected memory leaks of {}", + ToString(AI->Type)); + getContext()->logger.always( + "Direct leak of {} byte(s) at {} allocated from:", + AI->UserEnd - AI->UserBegin, (void *)AI->UserBegin); + AI->AllocStack.print(); +} + void ReportFatalError(const DeviceSanitizerReport &Report) { getContext()->logger.always("\n====ERROR: DeviceSanitizer: {}", ToString(Report.ErrorType)); @@ -124,7 +137,7 @@ void ReportUseAfterFree(const DeviceSanitizerReport &Report, getContext()->logger.always(" #0 {} {}:{}", Func, File, Report.Line); getContext()->logger.always(""); - if (Options(getContext()->logger).MaxQuarantineSizeMB > 0) { + if (getContext()->interceptor->getOptions().MaxQuarantineSizeMB > 0) { auto AllocInfoItOp = getContext()->interceptor->findAllocInfoByAddress(Report.Address); @@ -139,16 +152,10 @@ void ReportUseAfterFree(const DeviceSanitizerReport &Report, "Failed to find which chunck {} is allocated", (void *)Report.Address); } - assert(AllocInfo->IsReleased); + assert(AllocInfo->IsReleased && + "It must be released since it's use-after-free"); - getContext()->logger.always( - "{} is located inside of {} region [{}, {})", - (void *)Report.Address, ToString(AllocInfo->Type), - (void *)AllocInfo->UserBegin, (void *)AllocInfo->UserEnd); - getContext()->logger.always("allocated here:"); - 
AllocInfo->AllocStack.print(); - getContext()->logger.always("released here:"); - AllocInfo->ReleaseStack.print(); + PrintAllocateInfo(Report.Address, AllocInfo.get()); } } else { getContext()->logger.always( @@ -157,4 +164,49 @@ void ReportUseAfterFree(const DeviceSanitizerReport &Report, } } +void ReportInvalidKernelArgument(ur_kernel_handle_t Kernel, uint32_t ArgIndex, + uptr Addr, const ValidateUSMResult &VR, + StackTrace Stack) { + getContext()->logger.always("\n====ERROR: DeviceSanitizer: " + "invalid-argument on kernel <{}>", + DemangleName(GetKernelName(Kernel))); + Stack.print(); + auto &AI = VR.AI; + ArgIndex = ArgIndex + 1; + switch (VR.Type) { + case ValidateUSMResult::MAYBE_HOST_POINTER: + getContext()->logger.always("The {}th argument {} is not a USM pointer", + ArgIndex, (void *)Addr); + break; + case ValidateUSMResult::RELEASED_POINTER: + getContext()->logger.always( + "The {}th argument {} is a released USM pointer", ArgIndex + 1, + (void *)Addr); + PrintAllocateInfo(Addr, AI.get()); + break; + case ValidateUSMResult::BAD_CONTEXT: + getContext()->logger.always( + "The {}th argument {} is allocated in other context", ArgIndex + 1, + (void *)Addr); + PrintAllocateInfo(Addr, AI.get()); + break; + case ValidateUSMResult::BAD_DEVICE: + getContext()->logger.always( + "The {}th argument {} is allocated in other device", ArgIndex + 1, + (void *)Addr); + PrintAllocateInfo(Addr, AI.get()); + break; + case ValidateUSMResult::OUT_OF_BOUNDS: + getContext()->logger.always( + "The {}th argument {} is located outside of its region [{}, {})", + ArgIndex + 1, (void *)Addr, (void *)AI->UserBegin, + (void *)AI->UserEnd); + getContext()->logger.always("allocated here:"); + AI->AllocStack.print(); + break; + default: + break; + } +} + } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_report.hpp b/source/loader/layers/sanitizer/asan_report.hpp index 77a182b0e6..e679b30c5d 100644 --- a/source/loader/layers/sanitizer/asan_report.hpp +++ 
b/source/loader/layers/sanitizer/asan_report.hpp @@ -21,6 +21,7 @@ namespace ur_sanitizer_layer { struct DeviceSanitizerReport; struct AllocInfo; struct StackTrace; +struct ValidateUSMResult; void ReportBadFree(uptr Addr, const StackTrace &stack, const std::shared_ptr &AllocInfo); @@ -31,7 +32,10 @@ void ReportBadContext(uptr Addr, const StackTrace &stack, void ReportDoubleFree(uptr Addr, const StackTrace &Stack, const std::shared_ptr &AllocInfo); -// This type of error is usually unexpected mistake and doesn't have enough debug information +void ReportMemoryLeak(const std::shared_ptr &AI); + +// This type of error is usually unexpected mistake and doesn't have enough +// debug information void ReportFatalError(const DeviceSanitizerReport &Report); void ReportGenericError(const DeviceSanitizerReport &Report, @@ -40,4 +44,8 @@ void ReportGenericError(const DeviceSanitizerReport &Report, void ReportUseAfterFree(const DeviceSanitizerReport &Report, ur_kernel_handle_t Kernel, ur_context_handle_t Context); +void ReportInvalidKernelArgument(ur_kernel_handle_t Kernel, uint32_t ArgIndex, + uptr Addr, const ValidateUSMResult &VR, + StackTrace Stack); + } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_shadow.cpp b/source/loader/layers/sanitizer/asan_shadow.cpp new file mode 100644 index 0000000000..1f3ae18986 --- /dev/null +++ b/source/loader/layers/sanitizer/asan_shadow.cpp @@ -0,0 +1,270 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan_shadow.cpp + * + */ + +#include "asan_shadow.hpp" +#include "asan_interceptor.hpp" +#include "asan_libdevice.hpp" +#include "ur_sanitizer_layer.hpp" +#include "ur_sanitizer_utils.hpp" + +namespace ur_sanitizer_layer { + +std::shared_ptr GetShadowMemory(ur_context_handle_t Context, + ur_device_handle_t Device, + DeviceType Type) { + if (Type == DeviceType::CPU) { + static std::shared_ptr ShadowCPU = + std::make_shared(Context, Device); + return ShadowCPU; + } else if (Type == DeviceType::GPU_PVC) { + static std::shared_ptr ShadowPVC = + std::make_shared(Context, Device); + return ShadowPVC; + } else if (Type == DeviceType::GPU_DG2) { + static std::shared_ptr ShadowDG2 = + std::make_shared(Context, Device); + return ShadowDG2; + } else { + getContext()->logger.error("Unsupport device type"); + return nullptr; + } +} + +ur_result_t ShadowMemoryCPU::Setup() { + static ur_result_t Result = [this]() { + size_t ShadowSize = GetShadowSize(); + ShadowBegin = MmapNoReserve(0, ShadowSize); + if (ShadowBegin == 0) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + DontCoredumpRange(ShadowBegin, ShadowSize); + ShadowEnd = ShadowBegin + ShadowSize; + + // Set shadow memory for null pointer + auto URes = EnqueuePoisonShadow({}, 0, 1, kNullPointerRedzoneMagic); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("EnqueuePoisonShadow(NullPointerRZ): {}", + URes); + return URes; + } + return URes; + }(); + return Result; +} + +ur_result_t ShadowMemoryCPU::Destory() { + if (ShadowBegin == 0) { + return UR_RESULT_SUCCESS; + } + static ur_result_t Result = [this]() { + if (!Munmap(ShadowBegin, GetShadowSize())) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; + }(); + return Result; +} + +uptr ShadowMemoryCPU::MemToShadow(uptr Ptr) { + return ShadowBegin + (Ptr >> ASAN_SHADOW_SCALE); +} + +ur_result_t ShadowMemoryCPU::EnqueuePoisonShadow(ur_queue_handle_t, 
uptr Ptr, + uptr Size, u8 Value) { + if (Size == 0) { + return UR_RESULT_SUCCESS; + } + + uptr ShadowBegin = MemToShadow(Ptr); + uptr ShadowEnd = MemToShadow(Ptr + Size - 1); + assert(ShadowBegin <= ShadowEnd); + getContext()->logger.debug( + "EnqueuePoisonShadow(addr={}, count={}, value={})", (void *)ShadowBegin, + ShadowEnd - ShadowBegin + 1, (void *)(size_t)Value); + memset((void *)ShadowBegin, Value, ShadowEnd - ShadowBegin + 1); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ShadowMemoryGPU::Setup() { + // Currently, Level-Zero doesn't create independent VAs for each contexts, if we reserve + // shadow memory for each contexts, this will cause out-of-resource error when user uses + // multiple contexts. Therefore, we just create one shadow memory here. + static ur_result_t Result = [this]() { + size_t ShadowSize = GetShadowSize(); + // TODO: Protect Bad Zone + auto Result = getContext()->urDdiTable.VirtualMem.pfnReserve( + Context, nullptr, ShadowSize, (void **)&ShadowBegin); + if (Result == UR_RESULT_SUCCESS) { + ShadowEnd = ShadowBegin + ShadowSize; + // Retain the context which reserves shadow memory + getContext()->urDdiTable.Context.pfnRetain(Context); + } + + // Set shadow memory for null pointer + ManagedQueue Queue(Context, Device); + + Result = EnqueuePoisonShadow(Queue, 0, 1, kNullPointerRedzoneMagic); + if (Result != UR_RESULT_SUCCESS) { + getContext()->logger.error("EnqueuePoisonShadow(NullPointerRZ): {}", + Result); + return Result; + } + return Result; + }(); + return Result; +} + +ur_result_t ShadowMemoryGPU::Destory() { + if (ShadowBegin == 0) { + return UR_RESULT_SUCCESS; + } + static ur_result_t Result = [this]() { + auto Result = getContext()->urDdiTable.VirtualMem.pfnFree( + Context, (const void *)ShadowBegin, GetShadowSize()); + getContext()->urDdiTable.Context.pfnRelease(Context); + return Result; + }(); + return Result; +} + +ur_result_t ShadowMemoryGPU::EnqueuePoisonShadow(ur_queue_handle_t Queue, + uptr Ptr, uptr Size, + u8 Value) { + 
if (Size == 0) { + return UR_RESULT_SUCCESS; + } + + uptr ShadowBegin = MemToShadow(Ptr); + uptr ShadowEnd = MemToShadow(Ptr + Size - 1); + assert(ShadowBegin <= ShadowEnd); + { + static const size_t PageSize = + GetVirtualMemGranularity(Context, Device); + + ur_physical_mem_properties_t Desc{ + UR_STRUCTURE_TYPE_PHYSICAL_MEM_PROPERTIES, nullptr, 0}; + + // Make sure [Ptr, Ptr + Size] is mapped to physical memory + for (auto MappedPtr = RoundDownTo(ShadowBegin, PageSize); + MappedPtr <= ShadowEnd; MappedPtr += PageSize) { + std::scoped_lock Guard(VirtualMemMapsMutex); + if (VirtualMemMaps.find(MappedPtr) == VirtualMemMaps.end()) { + ur_physical_mem_handle_t PhysicalMem{}; + auto URes = getContext()->urDdiTable.PhysicalMem.pfnCreate( + Context, Device, PageSize, &Desc, &PhysicalMem); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("urPhysicalMemCreate(): {}", + URes); + return URes; + } + + URes = getContext()->urDdiTable.VirtualMem.pfnMap( + Context, (void *)MappedPtr, PageSize, PhysicalMem, 0, + UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("urVirtualMemMap({}, {}): {}", + (void *)MappedPtr, PageSize, + URes); + return URes; + } + + getContext()->logger.debug("urVirtualMemMap: {} ~ {}", + (void *)MappedPtr, + (void *)(MappedPtr + PageSize - 1)); + + // Initialize to zero + URes = EnqueueUSMBlockingSet(Queue, (void *)MappedPtr, 0, + PageSize); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("EnqueueUSMBlockingSet(): {}", + URes); + return URes; + } + + VirtualMemMaps[MappedPtr].first = PhysicalMem; + } + + // We don't need to record virtual memory map for null pointer, + // since it doesn't have an alloc info. 
+ if (Ptr == 0) { + continue; + } + + auto AllocInfoIt = + getContext()->interceptor->findAllocInfoByAddress(Ptr); + assert(AllocInfoIt); + VirtualMemMaps[MappedPtr].second.insert((*AllocInfoIt)->second); + } + } + + auto URes = EnqueueUSMBlockingSet(Queue, (void *)ShadowBegin, Value, + ShadowEnd - ShadowBegin + 1); + getContext()->logger.debug( + "EnqueuePoisonShadow (addr={}, count={}, value={}): {}", + (void *)ShadowBegin, ShadowEnd - ShadowBegin + 1, (void *)(size_t)Value, + URes); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("EnqueueUSMBlockingSet(): {}", URes); + return URes; + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ShadowMemoryGPU::ReleaseShadow(std::shared_ptr AI) { + uptr ShadowBegin = MemToShadow(AI->AllocBegin); + uptr ShadowEnd = MemToShadow(AI->AllocBegin + AI->AllocSize); + assert(ShadowBegin <= ShadowEnd); + + static const size_t PageSize = GetVirtualMemGranularity(Context, Device); + + for (auto MappedPtr = RoundDownTo(ShadowBegin, PageSize); + MappedPtr <= ShadowEnd; MappedPtr += PageSize) { + std::scoped_lock Guard(VirtualMemMapsMutex); + if (VirtualMemMaps.find(MappedPtr) == VirtualMemMaps.end()) { + continue; + } + VirtualMemMaps[MappedPtr].second.erase(AI); + if (VirtualMemMaps[MappedPtr].second.empty()) { + UR_CALL(getContext()->urDdiTable.VirtualMem.pfnUnmap( + Context, (void *)MappedPtr, PageSize)); + UR_CALL(getContext()->urDdiTable.PhysicalMem.pfnRelease( + VirtualMemMaps[MappedPtr].first)); + getContext()->logger.debug("urVirtualMemUnmap: {} ~ {}", + (void *)MappedPtr, + (void *)(MappedPtr + PageSize - 1)); + } + } + + return UR_RESULT_SUCCESS; +} + +uptr ShadowMemoryPVC::MemToShadow(uptr Ptr) { + if (Ptr & 0xFF00000000000000ULL) { // Device USM + return ShadowBegin + 0x80000000000ULL + + ((Ptr & 0xFFFFFFFFFFFFULL) >> ASAN_SHADOW_SCALE); + } else { // Only consider 47bit VA + return ShadowBegin + ((Ptr & 0x7FFFFFFFFFFFULL) >> ASAN_SHADOW_SCALE); + } +} + +uptr ShadowMemoryDG2::MemToShadow(uptr Ptr) { + if 
(Ptr & 0xFFFF000000000000ULL) { // Device USM + return ShadowBegin + 0x80000000000ULL + + ((Ptr & 0x7FFFFFFFFFFFULL) >> ASAN_SHADOW_SCALE); + } else { // Host/Shared USM + return ShadowBegin + (Ptr >> ASAN_SHADOW_SCALE); + } +} + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_shadow.hpp b/source/loader/layers/sanitizer/asan_shadow.hpp new file mode 100644 index 0000000000..7ae095062a --- /dev/null +++ b/source/loader/layers/sanitizer/asan_shadow.hpp @@ -0,0 +1,135 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan_shadow.hpp + * + */ + +#pragma once + +#include "asan_allocator.hpp" +#include "common.hpp" +#include + +namespace ur_sanitizer_layer { + +struct ShadowMemory { + ShadowMemory(ur_context_handle_t Context, ur_device_handle_t Device) + : Context(Context), Device(Device) {} + + virtual ~ShadowMemory() {} + + virtual ur_result_t Setup() = 0; + + virtual ur_result_t Destory() = 0; + + virtual uptr MemToShadow(uptr Ptr) = 0; + + virtual ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, + uptr Size, u8 Value) = 0; + + virtual ur_result_t ReleaseShadow(std::shared_ptr) { + return UR_RESULT_SUCCESS; + } + + virtual size_t GetShadowSize() = 0; + + ur_context_handle_t Context{}; + + ur_device_handle_t Device{}; + + uptr ShadowBegin = 0; + + uptr ShadowEnd = 0; +}; + +struct ShadowMemoryCPU final : public ShadowMemory { + ShadowMemoryCPU(ur_context_handle_t Context, ur_device_handle_t Device) + : ShadowMemory(Context, Device) {} + + ur_result_t Setup() override; + + ur_result_t Destory() override; + + uptr MemToShadow(uptr Ptr) override; + + ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, + uptr Size, u8 Value) override; + + size_t GetShadowSize() override { return 0x80000000000ULL; } +}; + +struct 
ShadowMemoryGPU : public ShadowMemory { + ShadowMemoryGPU(ur_context_handle_t Context, ur_device_handle_t Device) + : ShadowMemory(Context, Device) {} + + ur_result_t Setup() override; + + ur_result_t Destory() override; + ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, + uptr Size, u8 Value) override final; + + ur_result_t ReleaseShadow(std::shared_ptr AI) override final; + + ur_mutex VirtualMemMapsMutex; + + std::unordered_map< + uptr, std::pair>>> + VirtualMemMaps; +}; + +/// Shadow Memory layout of GPU PVC device +/// +/// USM Allocation Range (56 bits) +/// Host USM : 0x0000_0000_0000_0000 ~ 0x00ff_ffff_ffff_ffff +/// Shared USM : 0x0000_0000_0000_0000 ~ 0x0000_7fff_ffff_ffff +/// Device USM : 0xff00_0000_0000_0000 ~ 0xff00_ffff_ffff_ffff +/// +/// USM Allocation Range (AllocateHostAllocationsInHeapExtendedHost=0) +/// Host USM : 0x0000_0000_0000_0000 ~ 0x0000_7fff_ffff_ffff +/// Shared USM : 0x0000_0000_0000_0000 ~ 0x0000_7fff_ffff_ffff +/// Device USM : 0xff00_0000_0000_0000 ~ 0xff00_ffff_ffff_ffff +/// +/// Shadow Memory Mapping (SHADOW_SCALE=4, AllocateHostAllocationsInHeapExtendedHost=0) +/// Host/Shared USM : 0x0 ~ 0x07ff_ffff_ffff +/// Device USM : 0x0800_0000_0000 ~ 0x17ff_ffff_ffff +/// +struct ShadowMemoryPVC final : public ShadowMemoryGPU { + ShadowMemoryPVC(ur_context_handle_t Context, ur_device_handle_t Device) + : ShadowMemoryGPU(Context, Device) {} + + uptr MemToShadow(uptr Ptr) override; + + size_t GetShadowSize() override { return 0x180000000000ULL; } +}; + +/// Shadow Memory layout of GPU PVC device +/// +/// USM Allocation Range (48 bits) +/// Host/Shared USM : 0x0000_0000_0000_0000 ~ 0x0000_7fff_ffff_ffff +/// Device USM : 0xffff_8000_0000_0000 ~ 0xffff_ffff_ffff_ffff +/// +/// Shadow Memory Mapping (SHADOW_SCALE=4) +/// Host/Shared USM : 0x0 ~ 0x07ff_ffff_ffff +/// Device USM : 0x0800_0000_0000 ~ 0x0fff_ffff_ffff +/// +struct ShadowMemoryDG2 final : public ShadowMemoryGPU { + ShadowMemoryDG2(ur_context_handle_t Context, 
ur_device_handle_t Device) + : ShadowMemoryGPU(Context, Device) {} + + uptr MemToShadow(uptr Ptr) override; + + size_t GetShadowSize() override { return 0x100000000000ULL; } +}; + +std::shared_ptr GetShadowMemory(ur_context_handle_t Context, + ur_device_handle_t Device, + DeviceType Type); + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_shadow_setup.cpp b/source/loader/layers/sanitizer/asan_shadow_setup.cpp deleted file mode 100644 index 0860c70431..0000000000 --- a/source/loader/layers/sanitizer/asan_shadow_setup.cpp +++ /dev/null @@ -1,187 +0,0 @@ -/* - * - * Copyright (C) 2024 Intel Corporation - * - * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. - * See LICENSE.TXT - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * @file asan_shadow_setup.cpp - * - */ - -#include "asan_shadow_setup.hpp" -#include "ur_sanitizer_layer.hpp" - -namespace ur_sanitizer_layer { - -namespace cpu { - -constexpr size_t SHADOW_SIZE = 0x80000000000ULL; -uptr SHADOW_BEGIN; -uptr SHADOW_END; - -bool IsShadowMemInited = false; - -ur_result_t SetupShadowMemory(uptr &ShadowBegin, uptr &ShadowEnd) { - static ur_result_t Result = []() { - SHADOW_BEGIN = MmapNoReserve(0, SHADOW_SIZE); - if (SHADOW_BEGIN == 0) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } - SHADOW_END = SHADOW_BEGIN + SHADOW_SIZE; - IsShadowMemInited = true; - return UR_RESULT_SUCCESS; - }(); - ShadowBegin = SHADOW_BEGIN; - ShadowEnd = SHADOW_END; - return Result; -} - -ur_result_t DestroyShadowMemory() { - if (!IsShadowMemInited) { - return UR_RESULT_SUCCESS; - } - if (!Munmap(SHADOW_BEGIN, SHADOW_SIZE)) { - return UR_RESULT_ERROR_UNKNOWN; - } - return UR_RESULT_SUCCESS; -} - -} // namespace cpu - -namespace pvc { -/// -/// USM Allocation Range (56 bits) -/// Host USM : 0x0000_0000_0000_0000 ~ 0x00ff_ffff_ffff_ffff -/// Shared USM : 0x0000_0000_0000_0000 ~ 0x0000_7fff_ffff_ffff -/// Device USM : 0xff00_0000_0000_0000 ~ 
0xff00_ffff_ffff_ffff -/// -/// USM Allocation Range (AllocateHostAllocationsInHeapExtendedHost=0) -/// Host USM : 0x0000_0000_0000_0000 ~ 0x0000_7fff_ffff_ffff -/// Shared USM : 0x0000_0000_0000_0000 ~ 0x0000_7fff_ffff_ffff -/// Device USM : 0xff00_0000_0000_0000 ~ 0xff00_ffff_ffff_ffff -/// -/// Shadow Memory Mapping (SHADOW_SCALE=4, AllocateHostAllocationsInHeapExtendedHost=0) -/// Host/Shared USM : 0x0 ~ 0x07ff_ffff_ffff -/// Device USM : 0x0800_0000_0000 ~ 0x17ff_ffff_ffff -/// -constexpr size_t SHADOW_SIZE = 0x180000000000ULL; - -uptr LOW_SHADOW_BEGIN; -uptr HIGH_SHADOW_END; - -ur_context_handle_t ShadowContext; - -ur_result_t SetupShadowMemory(ur_context_handle_t Context, uptr &ShadowBegin, - uptr &ShadowEnd) { - // Currently, Level-Zero doesn't create independent VAs for each contexts, if we reserve - // shadow memory for each contexts, this will cause out-of-resource error when user uses - // multiple contexts. Therefore, we just create one shadow memory here. - static ur_result_t Result = [&Context]() { - // TODO: Protect Bad Zone - auto Result = getContext()->urDdiTable.VirtualMem.pfnReserve( - Context, nullptr, SHADOW_SIZE, (void **)&LOW_SHADOW_BEGIN); - if (Result == UR_RESULT_SUCCESS) { - HIGH_SHADOW_END = LOW_SHADOW_BEGIN + SHADOW_SIZE; - // Retain the context which reserves shadow memory - ShadowContext = Context; - getContext()->urDdiTable.Context.pfnRetain(Context); - } - return Result; - }(); - ShadowBegin = LOW_SHADOW_BEGIN; - ShadowEnd = HIGH_SHADOW_END; - return Result; -} - -ur_result_t DestroyShadowMemory() { - static ur_result_t Result = []() { - if (!ShadowContext) { - return UR_RESULT_SUCCESS; - } - auto Result = getContext()->urDdiTable.VirtualMem.pfnFree( - ShadowContext, (const void *)LOW_SHADOW_BEGIN, SHADOW_SIZE); - getContext()->urDdiTable.Context.pfnRelease(ShadowContext); - return Result; - }(); - return Result; -} - -} // namespace pvc - -namespace dg2 { -/// -/// USM Allocation Range (48 bits) -/// Host/Shared USM : 
0x0000_0000_0000_0000 ~ 0x0000_7fff_ffff_ffff -/// Device USM : 0xffff_8000_0000_0000 ~ 0xffff_ffff_ffff_ffff -/// -/// Shadow Memory Mapping (SHADOW_SCALE=4) -/// Host/Shared USM : 0x0 ~ 0x07ff_ffff_ffff -/// Device USM : 0x0800_0000_0000 ~ 0x0fff_ffff_ffff -/// -constexpr size_t SHADOW_SIZE = 0x100000000000ULL; - -uptr LOW_SHADOW_BEGIN; -uptr HIGH_SHADOW_END; - -ur_context_handle_t ShadowContext; - -ur_result_t SetupShadowMemory(ur_context_handle_t Context, uptr &ShadowBegin, - uptr &ShadowEnd) { - // Currently, Level-Zero doesn't create independent VAs for each contexts, if we reserve - // shadow memory for each contexts, this will cause out-of-resource error when user uses - // multiple contexts. Therefore, we just create one shadow memory here. - static ur_result_t Result = [&Context]() { - // TODO: Protect Bad Zone - auto Result = getContext()->urDdiTable.VirtualMem.pfnReserve( - Context, nullptr, SHADOW_SIZE, (void **)&LOW_SHADOW_BEGIN); - if (Result == UR_RESULT_SUCCESS) { - HIGH_SHADOW_END = LOW_SHADOW_BEGIN + SHADOW_SIZE; - // Retain the context which reserves shadow memory - ShadowContext = Context; - getContext()->urDdiTable.Context.pfnRetain(Context); - } - return Result; - }(); - ShadowBegin = LOW_SHADOW_BEGIN; - ShadowEnd = HIGH_SHADOW_END; - return Result; -} - -ur_result_t DestroyShadowMemory() { - static ur_result_t Result = []() { - if (!ShadowContext) { - return UR_RESULT_SUCCESS; - } - auto Result = getContext()->urDdiTable.VirtualMem.pfnFree( - ShadowContext, (const void *)LOW_SHADOW_BEGIN, SHADOW_SIZE); - getContext()->urDdiTable.Context.pfnRelease(ShadowContext); - return Result; - }(); - return Result; -} - -} // namespace dg2 - -ur_result_t SetupShadowMemoryOnCPU(uptr &ShadowBegin, uptr &ShadowEnd) { - return cpu::SetupShadowMemory(ShadowBegin, ShadowEnd); -} - -ur_result_t DestroyShadowMemoryOnCPU() { return cpu::DestroyShadowMemory(); } - -ur_result_t SetupShadowMemoryOnPVC(ur_context_handle_t Context, - uptr &ShadowBegin, uptr 
&ShadowEnd) { - return pvc::SetupShadowMemory(Context, ShadowBegin, ShadowEnd); -} - -ur_result_t DestroyShadowMemoryOnPVC() { return pvc::DestroyShadowMemory(); } - -ur_result_t SetupShadowMemoryOnDG2(ur_context_handle_t Context, - uptr &ShadowBegin, uptr &ShadowEnd) { - return dg2::SetupShadowMemory(Context, ShadowBegin, ShadowEnd); -} - -ur_result_t DestroyShadowMemoryOnDG2() { return dg2::DestroyShadowMemory(); } - -} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_shadow_setup.hpp b/source/loader/layers/sanitizer/asan_shadow_setup.hpp deleted file mode 100644 index cc8a27fee2..0000000000 --- a/source/loader/layers/sanitizer/asan_shadow_setup.hpp +++ /dev/null @@ -1,30 +0,0 @@ -/* - * - * Copyright (C) 2024 Intel Corporation - * - * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. - * See LICENSE.TXT - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * @file asan_shadow_setup.hpp - * - */ - -#pragma once - -#include "common.hpp" - -namespace ur_sanitizer_layer { - -ur_result_t SetupShadowMemoryOnCPU(uptr &ShadowBegin, uptr &ShadowEnd); -ur_result_t DestroyShadowMemoryOnCPU(); - -ur_result_t SetupShadowMemoryOnPVC(ur_context_handle_t Context, - uptr &ShadowBegin, uptr &ShadowEnd); -ur_result_t DestroyShadowMemoryOnPVC(); - -ur_result_t SetupShadowMemoryOnDG2(ur_context_handle_t Context, - uptr &ShadowBegin, uptr &ShadowEnd); -ur_result_t DestroyShadowMemoryOnDG2(); - -} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_statistics.cpp b/source/loader/layers/sanitizer/asan_statistics.cpp new file mode 100644 index 0000000000..82eef69c44 --- /dev/null +++ b/source/loader/layers/sanitizer/asan_statistics.cpp @@ -0,0 +1,146 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan_statistics.cpp + * + */ + +#include "asan_statistics.hpp" +#include "asan_interceptor.hpp" +#include "ur_sanitizer_layer.hpp" + +#include + +namespace ur_sanitizer_layer { + +struct AsanStats { + void UpdateUSMMalloced(uptr MallocedSize, uptr RedzoneSize); + void UpdateUSMFreed(uptr FreedSize); + void UpdateUSMRealFreed(uptr FreedSize, uptr RedzoneSize); + + void UpdateShadowMmaped(uptr ShadowSize); + void UpdateShadowMalloced(uptr ShadowSize); + void UpdateShadowFreed(uptr ShadowSize); + + void Print(ur_context_handle_t Context); + + private: + std::atomic UsmMalloced; + std::atomic UsmMallocedRedzones; + + // Quarantined memory + std::atomic UsmFreed; + + std::atomic ShadowMalloced; + + double Overhead = 0.0; + + void UpdateOverhead(); +}; + +void AsanStats::Print(ur_context_handle_t Context) { + getContext()->logger.always("Stats: Context {}", (void *)Context); + getContext()->logger.always("Stats: peak memory overhead: {}%", + Overhead * 100); +} + +void AsanStats::UpdateUSMMalloced(uptr MallocedSize, uptr RedzoneSize) { + UsmMalloced += MallocedSize; + UsmMallocedRedzones += RedzoneSize; + getContext()->logger.debug( + "Stats: UpdateUSMMalloced(UsmMalloced={}, UsmMallocedRedzones={})", + UsmMalloced, UsmMallocedRedzones); + UpdateOverhead(); +} + +void AsanStats::UpdateUSMFreed(uptr FreedSize) { + UsmFreed += FreedSize; + getContext()->logger.debug("Stats: UpdateUSMFreed(UsmFreed={})", UsmFreed); +} + +void AsanStats::UpdateUSMRealFreed(uptr FreedSize, uptr RedzoneSize) { + UsmMalloced -= FreedSize; + UsmMallocedRedzones -= RedzoneSize; + if (getContext()->interceptor->getOptions().MaxQuarantineSizeMB) { + UsmFreed -= FreedSize; + } + getContext()->logger.debug( + "Stats: UpdateUSMRealFreed(UsmMalloced={}, UsmMallocedRedzones={})", + UsmMalloced, UsmMallocedRedzones); + UpdateOverhead(); +} + +void AsanStats::UpdateShadowMalloced(uptr ShadowSize) { + 
ShadowMalloced += ShadowSize; + getContext()->logger.debug("Stats: UpdateShadowMalloced(ShadowMalloced={})", + ShadowMalloced); + UpdateOverhead(); +} + +void AsanStats::UpdateShadowFreed(uptr ShadowSize) { + ShadowMalloced -= ShadowSize; + getContext()->logger.debug("Stats: UpdateShadowFreed(ShadowMalloced={})", + ShadowMalloced); + UpdateOverhead(); +} + +void AsanStats::UpdateOverhead() { + auto TotalSize = UsmMalloced + ShadowMalloced; + if (TotalSize == 0) { + return; + } + auto NewOverhead = + (ShadowMalloced + UsmMallocedRedzones) / (double)TotalSize; + Overhead = std::max(Overhead, NewOverhead); +} + +void AsanStatsWrapper::UpdateUSMMalloced(uptr MallocedSize, uptr RedzoneSize) { + if (Stat) { + Stat->UpdateUSMMalloced(MallocedSize, RedzoneSize); + } +} + +void AsanStatsWrapper::UpdateUSMFreed(uptr FreedSize) { + if (Stat) { + Stat->UpdateUSMFreed(FreedSize); + } +} + +void AsanStatsWrapper::UpdateUSMRealFreed(uptr FreedSize, uptr RedzoneSize) { + if (Stat) { + Stat->UpdateUSMRealFreed(FreedSize, RedzoneSize); + } +} + +void AsanStatsWrapper::UpdateShadowMalloced(uptr ShadowSize) { + if (Stat) { + Stat->UpdateShadowMalloced(ShadowSize); + } +} + +void AsanStatsWrapper::UpdateShadowFreed(uptr ShadowSize) { + if (Stat) { + Stat->UpdateShadowFreed(ShadowSize); + } +} + +void AsanStatsWrapper::Print(ur_context_handle_t Context) { + if (Stat) { + Stat->Print(Context); + } +} + +AsanStatsWrapper::AsanStatsWrapper() : Stat(nullptr) { + if (getContext()->interceptor->getOptions().PrintStats) { + Stat = new AsanStats; + } +} + +AsanStatsWrapper::~AsanStatsWrapper() { delete Stat; } + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_statistics.hpp b/source/loader/layers/sanitizer/asan_statistics.hpp new file mode 100644 index 0000000000..fab30e28c0 --- /dev/null +++ b/source/loader/layers/sanitizer/asan_statistics.hpp @@ -0,0 +1,39 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under 
the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan_statistics.hpp + * + */ + +#pragma once + +#include "common.hpp" + +namespace ur_sanitizer_layer { + +struct AsanStats; + +struct AsanStatsWrapper { + + AsanStatsWrapper(); + ~AsanStatsWrapper(); + + void UpdateUSMMalloced(uptr MallocedSize, uptr RedzoneSize); + void UpdateUSMFreed(uptr FreedSize); + void UpdateUSMRealFreed(uptr FreedSize, uptr RedzoneSize); + + void UpdateShadowMalloced(uptr ShadowSize); + void UpdateShadowFreed(uptr ShadowSize); + + void Print(ur_context_handle_t Context); + + private: + AsanStats *Stat; +}; + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_validator.cpp b/source/loader/layers/sanitizer/asan_validator.cpp new file mode 100644 index 0000000000..a9f2bd2b17 --- /dev/null +++ b/source/loader/layers/sanitizer/asan_validator.cpp @@ -0,0 +1,77 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan_validator.cpp + * + */ + +#include "asan_validator.hpp" +#include "asan_interceptor.hpp" +#include "ur_sanitizer_utils.hpp" + +namespace ur_sanitizer_layer { + +namespace { + +bool IsSameDevice(ur_device_handle_t Device1, ur_device_handle_t Device2) { + if (Device1 == Device2) { + return true; + } + auto RootDevice1 = GetParentDevice(Device1); + RootDevice1 = RootDevice1 ? RootDevice1 : Device1; + auto RootDevice2 = GetParentDevice(Device2); + RootDevice2 = RootDevice2 ? 
RootDevice2 : Device2; + if (RootDevice1 == RootDevice2) { + return true; + } + return false; +} + +} // namespace + +ValidateUSMResult ValidateUSMPointer(ur_context_handle_t Context, + ur_device_handle_t Device, uptr Ptr) { + assert(Ptr != 0 && "Don't validate nullptr here"); + + auto AllocInfoItOp = getContext()->interceptor->findAllocInfoByAddress(Ptr); + if (!AllocInfoItOp) { + auto DI = getContext()->interceptor->getDeviceInfo(Device); + bool IsSupportSharedSystemUSM = DI->IsSupportSharedSystemUSM; + if (IsSupportSharedSystemUSM) { + // maybe it's host pointer + return ValidateUSMResult::success(); + } + return ValidateUSMResult::fail(ValidateUSMResult::MAYBE_HOST_POINTER); + } + + auto AllocInfo = AllocInfoItOp.value()->second; + + if (AllocInfo->Context != Context) { + return ValidateUSMResult::fail(ValidateUSMResult::BAD_CONTEXT, + AllocInfo); + } + + if (AllocInfo->Device && !IsSameDevice(AllocInfo->Device, Device)) { + return ValidateUSMResult::fail(ValidateUSMResult::BAD_DEVICE, + AllocInfo); + } + + if (AllocInfo->IsReleased) { + return ValidateUSMResult::fail(ValidateUSMResult::RELEASED_POINTER, + AllocInfo); + } + + if (Ptr < AllocInfo->UserBegin || Ptr >= AllocInfo->UserEnd) { + return ValidateUSMResult::fail(ValidateUSMResult::OUT_OF_BOUNDS, + AllocInfo); + } + + return ValidateUSMResult::success(); +} + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_validator.hpp b/source/loader/layers/sanitizer/asan_validator.hpp new file mode 100644 index 0000000000..52db966562 --- /dev/null +++ b/source/loader/layers/sanitizer/asan_validator.hpp @@ -0,0 +1,50 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan_validator.hpp + * + */ +#pragma once + +#include "asan_allocator.hpp" + +namespace ur_sanitizer_layer { + +struct ValidateUSMResult { + enum ErrorType { + SUCCESS, + NULL_POINTER, + MAYBE_HOST_POINTER, + RELEASED_POINTER, + BAD_CONTEXT, + BAD_DEVICE, + OUT_OF_BOUNDS + }; + ErrorType Type; + std::shared_ptr AI; + + operator bool() { return Type != SUCCESS; } + + static ValidateUSMResult success() { return {SUCCESS, nullptr}; } + + static ValidateUSMResult fail(ErrorType Type, + const std::shared_ptr &AI) { + assert(Type != SUCCESS && "The error type shouldn't be SUCCESS"); + return {Type, AI}; + } + + static ValidateUSMResult fail(ErrorType Type) { + assert(Type != SUCCESS && "The error type shouldn't be SUCCESS"); + return {Type, nullptr}; + } +}; + +ValidateUSMResult ValidateUSMPointer(ur_context_handle_t Context, + ur_device_handle_t Device, uptr Ptr); + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/common.hpp b/source/loader/layers/sanitizer/common.hpp index 445fedb1d8..90402b9429 100644 --- a/source/loader/layers/sanitizer/common.hpp +++ b/source/loader/layers/sanitizer/common.hpp @@ -127,13 +127,14 @@ inline uint64_t GetSizeAndRedzoneSizeForLocal(uint64_t Size, return Result; \ } +using BacktraceFrame = void *; using BacktraceInfo = std::string; struct SourceInfo { std::string file; std::string function; - int line; - int column; + int line = 0; + int column = 0; }; enum class DeviceType : uint64_t { UNKNOWN = 0, CPU, GPU_PVC, GPU_DG2 }; @@ -157,6 +158,7 @@ bool IsInASanContext(); uptr MmapNoReserve(uptr Addr, uptr Size); bool Munmap(uptr Addr, uptr Size); +bool DontCoredumpRange(uptr Addr, uptr Size); void *GetMemFunctionPointer(const char *); diff --git a/source/loader/layers/sanitizer/linux/backtrace.cpp b/source/loader/layers/sanitizer/linux/backtrace.cpp index 87c822d036..b746348205 100644 --- 
a/source/loader/layers/sanitizer/linux/backtrace.cpp +++ b/source/loader/layers/sanitizer/linux/backtrace.cpp @@ -15,22 +15,22 @@ namespace ur_sanitizer_layer { StackTrace GetCurrentBacktrace() { - void *Frames[MAX_BACKTRACE_FRAMES]; + BacktraceFrame Frames[MAX_BACKTRACE_FRAMES]; int FrameCount = backtrace(Frames, MAX_BACKTRACE_FRAMES); - char **Symbols = backtrace_symbols(Frames, FrameCount); - - if (Symbols == nullptr) { - return StackTrace(); - } StackTrace Stack; - for (int i = 0; i < FrameCount; i++) { - BacktraceInfo addr_info(Symbols[i]); - Stack.stack.emplace_back(std::move(addr_info)); - } - free(Symbols); + Stack.stack = + std::vector(&Frames[0], &Frames[FrameCount - 1]); return Stack; } +char **GetBacktraceSymbols(const std::vector &BacktraceFrames) { + assert(!BacktraceFrames.empty()); + + char **BacktraceSymbols = + backtrace_symbols(&BacktraceFrames[0], BacktraceFrames.size()); + return BacktraceSymbols; +} + } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/linux/sanitizer_utils.cpp b/source/loader/layers/sanitizer/linux/sanitizer_utils.cpp index 64704180ad..d0bc038174 100644 --- a/source/loader/layers/sanitizer/linux/sanitizer_utils.cpp +++ b/source/loader/layers/sanitizer/linux/sanitizer_utils.cpp @@ -37,6 +37,12 @@ uptr MmapNoReserve(uptr Addr, uptr Size) { bool Munmap(uptr Addr, uptr Size) { return munmap((void *)Addr, Size) == 0; } +bool DontCoredumpRange(uptr Addr, uptr Size) { + Size = RoundUpTo(Size, EXEC_PAGESIZE); + Addr = RoundDownTo(Addr, EXEC_PAGESIZE); + return madvise((void *)Addr, Size, MADV_DONTDUMP) == 0; +} + void *GetMemFunctionPointer(const char *FuncName) { void *handle = dlopen(LIBC_SO, RTLD_LAZY | RTLD_NOLOAD); if (!handle) { diff --git a/source/loader/layers/sanitizer/linux/symbolizer.cpp b/source/loader/layers/sanitizer/linux/symbolizer.cpp new file mode 100644 index 0000000000..bcc90738f2 --- /dev/null +++ b/source/loader/layers/sanitizer/linux/symbolizer.cpp @@ -0,0 +1,67 @@ +/* + * + * 
Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + */ +#include "llvm/DebugInfo/Symbolize/DIPrinter.h" +#include "llvm/DebugInfo/Symbolize/Symbolize.h" + +namespace ur_sanitizer_layer { + +llvm::symbolize::LLVMSymbolizer *GetSymbolizer() { + static llvm::symbolize::LLVMSymbolizer Symbolizer; + return &Symbolizer; +} + +llvm::symbolize::PrinterConfig GetPrinterConfig() { + llvm::symbolize::PrinterConfig Config; + Config.Pretty = false; + Config.PrintAddress = false; + Config.PrintFunctions = true; + Config.SourceContextLines = 0; + Config.Verbose = false; + return Config; +} + +} // namespace ur_sanitizer_layer + +extern "C" { + +void SymbolizeCode(const char *ModuleName, uint64_t ModuleOffset, + char *ResultString, size_t ResultSize, size_t *RetSize) { + std::string Result; + llvm::raw_string_ostream OS(Result); + llvm::symbolize::Request Request{ModuleName, ModuleOffset}; + llvm::symbolize::PrinterConfig Config = + ur_sanitizer_layer::GetPrinterConfig(); + llvm::symbolize::ErrorHandler EH = [&](const llvm::ErrorInfoBase &ErrorInfo, + llvm::StringRef ErrorBanner) { + OS << ErrorBanner; + ErrorInfo.log(OS); + OS << '\n'; + }; + auto Printer = + std::make_unique(OS, EH, Config); + + auto ResOrErr = ur_sanitizer_layer::GetSymbolizer()->symbolizeInlinedCode( + ModuleName, + {ModuleOffset, llvm::object::SectionedAddress::UndefSection}); + + if (!ResOrErr) { + return; + } + Printer->print(Request, *ResOrErr); + ur_sanitizer_layer::GetSymbolizer()->pruneCache(); + if (RetSize) { + *RetSize = Result.size() + 1; + } + if (ResultString) { + std::strncpy(ResultString, Result.c_str(), ResultSize); + ResultString[ResultSize - 1] = '\0'; + } +} +} diff --git a/source/loader/layers/sanitizer/stacktrace.cpp b/source/loader/layers/sanitizer/stacktrace.cpp index 6dcf447765..8adaa2cd34 100644 --- 
a/source/loader/layers/sanitizer/stacktrace.cpp +++ b/source/loader/layers/sanitizer/stacktrace.cpp @@ -13,6 +13,14 @@ #include "stacktrace.hpp" #include "ur_sanitizer_layer.hpp" +extern "C" { + +__attribute__((weak)) void SymbolizeCode(const char *ModuleName, + uint64_t ModuleOffset, + char *ResultString, size_t ResultSize, + size_t *RetSize); +} + namespace ur_sanitizer_layer { namespace { @@ -21,6 +29,54 @@ bool Contains(const std::string &s, const char *p) { return s.find(p) != std::string::npos; } +// Parse back trace information in the following formats: +// ([function_name]+function_offset) [offset] +void ParseBacktraceInfo(BacktraceInfo BI, std::string &ModuleName, + uptr &Offset) { + // Parse module name + size_t End = BI.find_first_of('('); + assert(End != std::string::npos); + ModuleName = BI.substr(0, End); + // Parse offset + size_t Start = BI.find_first_of('['); + assert(Start != std::string::npos); + Start = BI.substr(Start + 1, 2) == "0x" ? Start + 3 : Start + 1; + End = BI.find_first_of(']'); + assert(End != std::string::npos); + Offset = std::stoull(BI.substr(Start, End), nullptr, 16); + return; +} + +// Parse symbolizer output in the following formats: +// +// :[:] +SourceInfo ParseSymbolizerOutput(const std::string &Output) { + SourceInfo Info; + // Parse function name + size_t End = Output.find_first_of('\n'); + assert(End != std::string::npos); + Info.function = Output.substr(0, End); + // Parse file name + size_t Start = End + 1; + End = Output.find_first_of(':', Start); + assert(End != std::string::npos); + Info.file = Output.substr(Start, End - Start); + // Parse line number + Start = End + 1; + End = Output.find_first_of(":\n", Start); + assert(End != std::string::npos); + Info.line = std::stoi(Output.substr(Start, End - Start)); + // Parse column number if exists + if (Output[End] == ':') { + Start = End + 1; + End = Output.find_first_of("\n", Start); + assert(End != std::string::npos); + Info.column = std::stoi(Output.substr(Start, End - 
Start)); + } + + return Info; +} + } // namespace void StackTrace::print() const { @@ -30,17 +86,49 @@ void StackTrace::print() const { unsigned index = 0; - for (auto &BI : stack) { + char **BacktraceSymbols = GetBacktraceSymbols(stack); + + for (size_t i = 0; i < stack.size(); i++) { + BacktraceInfo BI = BacktraceSymbols[i]; + // Skip runtime modules if (Contains(BI, "libsycl.so") || Contains(BI, "libpi_unified_runtime.so") || Contains(BI, "libur_loader.so")) { continue; } - getContext()->logger.always(" #{} {}", index, BI); + + if (&SymbolizeCode != nullptr) { + std::string Result; + std::string ModuleName; + uptr Offset; + ParseBacktraceInfo(BI, ModuleName, Offset); + size_t ResultSize = 0; + SymbolizeCode(ModuleName.c_str(), Offset, nullptr, 0, &ResultSize); + if (ResultSize) { + std::vector ResultVector(ResultSize); + SymbolizeCode(ModuleName.c_str(), Offset, ResultVector.data(), + ResultSize, nullptr); + std::string Result((char *)ResultVector.data()); + SourceInfo SrcInfo = ParseSymbolizerOutput(Result); + if (SrcInfo.file != "??") { + getContext()->logger.always(" #{} in {} {}:{}:{}", index, + SrcInfo.function, SrcInfo.file, + SrcInfo.line, SrcInfo.column); + } else { + getContext()->logger.always(" #{} in {} ({}+{})", index, + SrcInfo.function, ModuleName, + (void *)Offset); + } + } + } else { + getContext()->logger.always(" #{} {}", index, BI); + } ++index; } getContext()->logger.always(""); + + free(BacktraceSymbols); } } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/stacktrace.hpp b/source/loader/layers/sanitizer/stacktrace.hpp index 31b661cc7d..57811bba01 100644 --- a/source/loader/layers/sanitizer/stacktrace.hpp +++ b/source/loader/layers/sanitizer/stacktrace.hpp @@ -21,11 +21,13 @@ namespace ur_sanitizer_layer { constexpr size_t MAX_BACKTRACE_FRAMES = 64; struct StackTrace { - std::vector stack; + std::vector stack; void print() const; }; StackTrace GetCurrentBacktrace(); +char **GetBacktraceSymbols(const std::vector 
&BacktraceFrames); + } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/ur_sanddi.cpp b/source/loader/layers/sanitizer/ur_sanddi.cpp index 7fae0285b8..2f02c9270e 100644 --- a/source/loader/layers/sanitizer/ur_sanddi.cpp +++ b/source/loader/layers/sanitizer/ur_sanddi.cpp @@ -11,9 +11,13 @@ */ #include "asan_interceptor.hpp" +#include "asan_options.hpp" +#include "stacktrace.hpp" #include "ur_sanitizer_layer.hpp" #include "ur_sanitizer_utils.hpp" +#include + namespace ur_sanitizer_layer { namespace { @@ -31,9 +35,13 @@ ur_result_t setupContext(ur_context_handle_t Context, uint32_t numDevices, getContext()->logger.error("Unsupport device"); return UR_RESULT_ERROR_INVALID_DEVICE; } - getContext()->logger.info("Add {} into context {}", ToString(DI->Type), + getContext()->logger.info( + "DeviceInfo {} (Type={}, IsSupportSharedSystemUSM={})", + (void *)DI->Handle, ToString(DI->Type), + DI->IsSupportSharedSystemUSM); + getContext()->logger.info("Add {} into context {}", (void *)DI->Handle, (void *)Context); - if (!DI->ShadowOffset) { + if (!DI->Shadow) { UR_CALL(DI->allocShadowMemory(Context)); } CI->DeviceList.emplace_back(hDevice); @@ -44,6 +52,38 @@ ur_result_t setupContext(ur_context_handle_t Context, uint32_t numDevices, } // namespace +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urAdapterGet +__urdlllocal ur_result_t UR_APICALL urAdapterGet( + uint32_t + NumEntries, ///< [in] the number of adapters to be added to phAdapters. + ///< If phAdapters is not NULL, then NumEntries should be greater than + ///< zero, otherwise ::UR_RESULT_ERROR_INVALID_SIZE, + ///< will be returned. + ur_adapter_handle_t * + phAdapters, ///< [out][optional][range(0, NumEntries)] array of handle of adapters. + ///< If NumEntries is less than the number of adapters available, then + ///< ::urAdapterGet shall only retrieve that number of platforms. 
+ uint32_t * + pNumAdapters ///< [out][optional] returns the total number of adapters available. +) { + auto pfnAdapterGet = getContext()->urDdiTable.Global.pfnAdapterGet; + + if (nullptr == pfnAdapterGet) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_result_t result = pfnAdapterGet(NumEntries, phAdapters, pNumAdapters); + if (result == UR_RESULT_SUCCESS && phAdapters) { + const uint32_t NumAdapters = pNumAdapters ? *pNumAdapters : NumEntries; + for (uint32_t i = 0; i < NumAdapters; ++i) { + UR_CALL(getContext()->interceptor->holdAdapter(phAdapters[i])); + } + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urUSMHostAlloc __urdlllocal ur_result_t UR_APICALL urUSMHostAlloc( @@ -135,6 +175,112 @@ __urdlllocal ur_result_t UR_APICALL urUSMFree( return getContext()->interceptor->releaseMemory(hContext, pMem); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urProgramCreateWithIL +__urdlllocal ur_result_t UR_APICALL urProgramCreateWithIL( + ur_context_handle_t hContext, ///< [in] handle of the context instance + const void *pIL, ///< [in] pointer to IL binary. + size_t length, ///< [in] length of `pIL` in bytes. + const ur_program_properties_t * + pProperties, ///< [in][optional] pointer to program creation properties. + ur_program_handle_t + *phProgram ///< [out] pointer to handle of program object created. 
+) { + auto pfnProgramCreateWithIL = + getContext()->urDdiTable.Program.pfnCreateWithIL; + + if (nullptr == pfnProgramCreateWithIL) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + getContext()->logger.debug("==== urProgramCreateWithIL"); + + UR_CALL( + pfnProgramCreateWithIL(hContext, pIL, length, pProperties, phProgram)); + UR_CALL(getContext()->interceptor->insertProgram(*phProgram)); + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urProgramCreateWithBinary +__urdlllocal ur_result_t UR_APICALL urProgramCreateWithBinary( + ur_context_handle_t hContext, ///< [in] handle of the context instance + ur_device_handle_t + hDevice, ///< [in] handle to device associated with binary. + size_t size, ///< [in] size in bytes. + const uint8_t *pBinary, ///< [in] pointer to binary. + const ur_program_properties_t * + pProperties, ///< [in][optional] pointer to program creation properties. + ur_program_handle_t + *phProgram ///< [out] pointer to handle of Program object created. +) { + auto pfnProgramCreateWithBinary = + getContext()->urDdiTable.Program.pfnCreateWithBinary; + + if (nullptr == pfnProgramCreateWithBinary) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + getContext()->logger.debug("==== urProgramCreateWithBinary"); + + UR_CALL(pfnProgramCreateWithBinary(hContext, hDevice, size, pBinary, + pProperties, phProgram)); + UR_CALL(getContext()->interceptor->insertProgram(*phProgram)); + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urProgramCreateWithNativeHandle +__urdlllocal ur_result_t UR_APICALL urProgramCreateWithNativeHandle( + ur_native_handle_t + hNativeProgram, ///< [in][nocheck] the native handle of the program. 
+ ur_context_handle_t hContext, ///< [in] handle of the context instance + const ur_program_native_properties_t * + pProperties, ///< [in][optional] pointer to native program properties struct. + ur_program_handle_t * + phProgram ///< [out] pointer to the handle of the program object created. +) { + auto pfnProgramCreateWithNativeHandle = + getContext()->urDdiTable.Program.pfnCreateWithNativeHandle; + + if (nullptr == pfnProgramCreateWithNativeHandle) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + getContext()->logger.debug("==== urProgramCreateWithNativeHandle"); + + UR_CALL(pfnProgramCreateWithNativeHandle(hNativeProgram, hContext, + pProperties, phProgram)); + UR_CALL(getContext()->interceptor->insertProgram(*phProgram)); + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urProgramRetain +__urdlllocal ur_result_t UR_APICALL urProgramRetain( + ur_program_handle_t + hProgram ///< [in][retain] handle for the Program to retain +) { + auto pfnRetain = getContext()->urDdiTable.Program.pfnRetain; + + if (nullptr == pfnRetain) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + getContext()->logger.debug("==== urProgramRetain"); + + UR_CALL(pfnRetain(hProgram)); + + auto ProgramInfo = getContext()->interceptor->getProgramInfo(hProgram); + UR_ASSERT(ProgramInfo != nullptr, UR_RESULT_ERROR_INVALID_VALUE); + ProgramInfo->RefCount++; + + return UR_RESULT_SUCCESS; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuild __urdlllocal ur_result_t UR_APICALL urProgramBuild( @@ -152,8 +298,7 @@ __urdlllocal ur_result_t UR_APICALL urProgramBuild( UR_CALL(pfnProgramBuild(hContext, hProgram, pOptions)); - UR_CALL( - getContext()->interceptor->registerDeviceGlobals(hContext, hProgram)); + UR_CALL(getContext()->interceptor->registerProgram(hContext, hProgram)); return UR_RESULT_SUCCESS; } @@ 
-177,8 +322,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( getContext()->logger.debug("==== urProgramBuildExp"); UR_CALL(pfnBuildExp(hProgram, numDevices, phDevices, pOptions)); - UR_CALL(getContext()->interceptor->registerDeviceGlobals( - GetContext(hProgram), hProgram)); + UR_CALL(getContext()->interceptor->registerProgram(GetContext(hProgram), + hProgram)); return UR_RESULT_SUCCESS; } @@ -205,8 +350,7 @@ __urdlllocal ur_result_t UR_APICALL urProgramLink( UR_CALL(pfnProgramLink(hContext, count, phPrograms, pOptions, phProgram)); - UR_CALL( - getContext()->interceptor->registerDeviceGlobals(hContext, *phProgram)); + UR_CALL(getContext()->interceptor->registerProgram(hContext, *phProgram)); return UR_RESULT_SUCCESS; } @@ -237,8 +381,33 @@ ur_result_t UR_APICALL urProgramLinkExp( UR_CALL(pfnProgramLinkExp(hContext, numDevices, phDevices, count, phPrograms, pOptions, phProgram)); - UR_CALL( - getContext()->interceptor->registerDeviceGlobals(hContext, *phProgram)); + UR_CALL(getContext()->interceptor->registerProgram(hContext, *phProgram)); + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urProgramRelease +ur_result_t UR_APICALL urProgramRelease( + ur_program_handle_t + hProgram ///< [in][release] handle for the Program to release +) { + auto pfnProgramRelease = getContext()->urDdiTable.Program.pfnRelease; + + if (nullptr == pfnProgramRelease) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + getContext()->logger.debug("==== urProgramRelease"); + + UR_CALL(pfnProgramRelease(hProgram)); + + auto ProgramInfo = getContext()->interceptor->getProgramInfo(hProgram); + UR_ASSERT(ProgramInfo != nullptr, UR_RESULT_ERROR_INVALID_VALUE); + if (--ProgramInfo->RefCount == 0) { + UR_CALL(getContext()->interceptor->unregisterProgram(hProgram)); + UR_CALL(getContext()->interceptor->eraseProgram(hProgram)); + } return UR_RESULT_SUCCESS; } @@ -371,6 +540,29 @@ 
__urdlllocal ur_result_t UR_APICALL urContextCreateWithNativeHandle( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urContextRetain +__urdlllocal ur_result_t UR_APICALL urContextRetain( + ur_context_handle_t + hContext ///< [in] handle of the context to get a reference of. +) { + auto pfnRetain = getContext()->urDdiTable.Context.pfnRetain; + + if (nullptr == pfnRetain) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + getContext()->logger.debug("==== urContextRetain"); + + UR_CALL(pfnRetain(hContext)); + + auto ContextInfo = getContext()->interceptor->getContextInfo(hContext); + UR_ASSERT(ContextInfo != nullptr, UR_RESULT_ERROR_INVALID_VALUE); + ContextInfo->RefCount++; + + return UR_RESULT_SUCCESS; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urContextRelease __urdlllocal ur_result_t UR_APICALL urContextRelease( @@ -384,10 +576,15 @@ __urdlllocal ur_result_t UR_APICALL urContextRelease( getContext()->logger.debug("==== urContextRelease"); - UR_CALL(getContext()->interceptor->eraseContext(hContext)); - ur_result_t result = pfnRelease(hContext); + UR_CALL(pfnRelease(hContext)); - return result; + auto ContextInfo = getContext()->interceptor->getContextInfo(hContext); + UR_ASSERT(ContextInfo != nullptr, UR_RESULT_ERROR_INVALID_VALUE); + if (--ContextInfo->RefCount == 0) { + UR_CALL(getContext()->interceptor->eraseContext(hContext)); + } + + return UR_RESULT_SUCCESS; } /////////////////////////////////////////////////////////////////////////////// @@ -424,6 +621,19 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferCreate( std::shared_ptr<MemBuffer> pMemBuffer = std::make_shared<MemBuffer>(hContext, size, hostPtrOrNull); + + if (Host && (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) { + std::shared_ptr<ContextInfo> CtxInfo = + getContext()->interceptor->getContextInfo(hContext); + for (const auto &hDevice : CtxInfo->DeviceList) { + 
ManagedQueue InternalQueue(hContext, hDevice); + char *Handle = nullptr; + UR_CALL(pMemBuffer->getHandle(hDevice, Handle)); + UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + InternalQueue, true, Handle, Host, size, 0, nullptr, nullptr)); + } + } + ur_result_t result = getContext()->interceptor->insertMemBuffer(pMemBuffer); *phBuffer = ur_cast(pMemBuffer.get()); @@ -1154,9 +1364,9 @@ __urdlllocal ur_result_t UR_APICALL urKernelRetain( UR_CALL(pfnRetain(hKernel)); - if (auto KernelInfo = getContext()->interceptor->getKernelInfo(hKernel)) { - KernelInfo->RefCount++; - } + auto KernelInfo = getContext()->interceptor->getKernelInfo(hKernel); + UR_ASSERT(KernelInfo != nullptr, UR_RESULT_ERROR_INVALID_VALUE); + KernelInfo->RefCount++; return UR_RESULT_SUCCESS; } @@ -1175,10 +1385,9 @@ __urdlllocal ur_result_t urKernelRelease( getContext()->logger.debug("==== urKernelRelease"); UR_CALL(pfnRelease(hKernel)); - if (auto KernelInfo = getContext()->interceptor->getKernelInfo(hKernel)) { - if (--KernelInfo->RefCount != 0) { - return UR_RESULT_SUCCESS; - } + auto KernelInfo = getContext()->interceptor->getKernelInfo(hKernel); + UR_ASSERT(KernelInfo != nullptr, UR_RESULT_ERROR_INVALID_VALUE); + if (--KernelInfo->RefCount == 0) { UR_CALL(getContext()->interceptor->eraseKernel(hKernel)); } @@ -1283,6 +1492,69 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgLocal( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelSetArgPointer +__urdlllocal ur_result_t UR_APICALL urKernelSetArgPointer( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + uint32_t argIndex, ///< [in] argument index in range [0, num args - 1] + const ur_kernel_arg_pointer_properties_t + *pProperties, ///< [in][optional] pointer to USM pointer properties. + const void * + pArgValue ///< [in][optional] Pointer obtained by USM allocation or virtual memory + ///< mapping operation. 
If null then argument value is considered null. +) { + auto pfnSetArgPointer = getContext()->urDdiTable.Kernel.pfnSetArgPointer; + + if (nullptr == pfnSetArgPointer) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + getContext()->logger.debug( + "==== urKernelSetArgPointer (argIndex={}, pArgValue={})", argIndex, + pArgValue); + + if (getContext()->interceptor->getOptions().DetectKernelArguments) { + auto KI = getContext()->interceptor->getKernelInfo(hKernel); + std::scoped_lock Guard(KI->Mutex); + KI->PointerArgs[argIndex] = {pArgValue, GetCurrentBacktrace()}; + } + + ur_result_t result = + pfnSetArgPointer(hKernel, argIndex, pProperties, pArgValue); + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's Global table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +__urdlllocal ur_result_t UR_APICALL urGetGlobalProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_global_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_MAJOR_VERSION(ur_sanitizer_layer::getContext()->version) != + UR_MAJOR_VERSION(version) || + UR_MINOR_VERSION(ur_sanitizer_layer::getContext()->version) > + UR_MINOR_VERSION(version)) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnAdapterGet = ur_sanitizer_layer::urAdapterGet; + + return result; +} /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Context table /// with current process' addresses @@ -1310,6 +1582,7 @@ __urdlllocal ur_result_t UR_APICALL urGetContextProcAddrTable( ur_result_t result = 
UR_RESULT_SUCCESS; pDdiTable->pfnCreate = ur_sanitizer_layer::urContextCreate; + pDdiTable->pfnRetain = ur_sanitizer_layer::urContextRetain; pDdiTable->pfnRelease = ur_sanitizer_layer::urContextRelease; pDdiTable->pfnCreateWithNativeHandle = @@ -1341,8 +1614,15 @@ __urdlllocal ur_result_t UR_APICALL urGetProgramProcAddrTable( return UR_RESULT_ERROR_UNSUPPORTED_VERSION; } + pDdiTable->pfnCreateWithIL = ur_sanitizer_layer::urProgramCreateWithIL; + pDdiTable->pfnCreateWithBinary = + ur_sanitizer_layer::urProgramCreateWithBinary; + pDdiTable->pfnCreateWithNativeHandle = + ur_sanitizer_layer::urProgramCreateWithNativeHandle; pDdiTable->pfnBuild = ur_sanitizer_layer::urProgramBuild; pDdiTable->pfnLink = ur_sanitizer_layer::urProgramLink; + pDdiTable->pfnRetain = ur_sanitizer_layer::urProgramRetain; + pDdiTable->pfnRelease = ur_sanitizer_layer::urProgramRelease; return UR_RESULT_SUCCESS; } @@ -1379,6 +1659,7 @@ __urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgValue = ur_sanitizer_layer::urKernelSetArgValue; pDdiTable->pfnSetArgMemObj = ur_sanitizer_layer::urKernelSetArgMemObj; pDdiTable->pfnSetArgLocal = ur_sanitizer_layer::urKernelSetArgLocal; + pDdiTable->pfnSetArgPointer = ur_sanitizer_layer::urKernelSetArgPointer; return result; } @@ -1530,7 +1811,7 @@ ur_result_t context_t::init(ur_dditable_t *dditable, if (enabledLayerNames.count("UR_LAYER_ASAN")) { enabledType = SanitizerType::AddressSanitizer; - interceptor = std::make_unique(logger); + interceptor = std::make_unique(); } else if (enabledLayerNames.count("UR_LAYER_MSAN")) { enabledType = SanitizerType::MemorySanitizer; } else if (enabledLayerNames.count("UR_LAYER_TSAN")) { @@ -1555,6 +1836,11 @@ ur_result_t context_t::init(ur_dditable_t *dditable, urDdiTable = *dditable; + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::urGetGlobalProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->Global); + } + if (UR_RESULT_SUCCESS == result) { result = 
ur_sanitizer_layer::urGetContextProcAddrTable( UR_API_VERSION_CURRENT, &dditable->Context); diff --git a/source/loader/layers/sanitizer/ur_sanitizer_utils.cpp b/source/loader/layers/sanitizer/ur_sanitizer_utils.cpp index 2dd98b945d..53e4326ed4 100644 --- a/source/loader/layers/sanitizer/ur_sanitizer_utils.cpp +++ b/source/loader/layers/sanitizer/ur_sanitizer_utils.cpp @@ -72,6 +72,22 @@ ur_device_handle_t GetDevice(ur_queue_handle_t Queue) { return Device; } +std::vector<ur_device_handle_t> GetDevices(ur_context_handle_t Context) { + std::vector<ur_device_handle_t> Devices{}; + uint32_t DeviceNum = 0; + [[maybe_unused]] ur_result_t Result; + Result = getContext()->urDdiTable.Context.pfnGetInfo( + Context, UR_CONTEXT_INFO_NUM_DEVICES, sizeof(uint32_t), &DeviceNum, + nullptr); + assert(Result == UR_RESULT_SUCCESS && "getDevices(Context) failed"); + Devices.resize(DeviceNum); + Result = getContext()->urDdiTable.Context.pfnGetInfo( + Context, UR_CONTEXT_INFO_DEVICES, + sizeof(ur_device_handle_t) * DeviceNum, Devices.data(), nullptr); + assert(Result == UR_RESULT_SUCCESS && "getDevices(Context) failed"); + return Devices; +} + ur_program_handle_t GetProgram(ur_kernel_handle_t Kernel) { ur_program_handle_t Program{}; [[maybe_unused]] auto Result = getContext()->urDdiTable.Kernel.pfnGetInfo( @@ -135,7 +151,7 @@ DeviceType GetDeviceType(ur_context_handle_t Context, assert(Result == UR_RESULT_SUCCESS && "getDeviceType() failed at allocating device USM"); // FIXME: There's no API querying the address bits of device, so we guess it by the - // value of device USM pointer (see "USM Allocation Range" in asan_shadow_setup.cpp) + // value of device USM pointer (see "USM Allocation Range" in asan_shadow.cpp) auto Type = DeviceType::UNKNOWN; if (Ptr >> 48 == 0xff00U) { Type = DeviceType::GPU_PVC; @@ -152,18 +168,37 @@ } } -std::vector<ur_device_handle_t> GetProgramDevices(ur_program_handle_t Program) { - size_t PropSize; +ur_device_handle_t GetParentDevice(ur_device_handle_t Device) { + 
ur_device_handle_t ParentDevice{}; + [[maybe_unused]] auto Result = getContext()->urDdiTable.Device.pfnGetInfo( + Device, UR_DEVICE_INFO_PARENT_DEVICE, sizeof(ur_device_handle_t), + &ParentDevice, nullptr); + assert(Result == UR_RESULT_SUCCESS && "getParentDevice() failed"); + return ParentDevice; +} + +bool GetDeviceUSMCapability(ur_device_handle_t Device, + ur_device_info_t USMInfo) { + ur_device_usm_access_capability_flags_t Flag; + [[maybe_unused]] auto Result = getContext()->urDdiTable.Device.pfnGetInfo( + Device, USMInfo, sizeof(Flag), &Flag, nullptr); + return (bool)Flag; +} + +std::vector<ur_device_handle_t> GetDevices(ur_program_handle_t Program) { + uint32_t DeviceNum = 0; [[maybe_unused]] ur_result_t Result = getContext()->urDdiTable.Program.pfnGetInfo( - Program, UR_PROGRAM_INFO_DEVICES, 0, nullptr, &PropSize); - assert(Result == UR_RESULT_SUCCESS); + Program, UR_PROGRAM_INFO_NUM_DEVICES, sizeof(DeviceNum), &DeviceNum, + nullptr); + assert(Result == UR_RESULT_SUCCESS && "getDevices(Program) failed"); std::vector<ur_device_handle_t> Devices; - Devices.resize(PropSize / sizeof(ur_device_handle_t)); + Devices.resize(DeviceNum); Result = getContext()->urDdiTable.Program.pfnGetInfo( - Program, UR_PROGRAM_INFO_DEVICES, PropSize, Devices.data(), nullptr); - assert(Result == UR_RESULT_SUCCESS); + Program, UR_PROGRAM_INFO_DEVICES, + DeviceNum * sizeof(ur_device_handle_t), Devices.data(), nullptr); + assert(Result == UR_RESULT_SUCCESS && "getDevices(Program) failed"); return Devices; } @@ -207,4 +242,15 @@ size_t GetVirtualMemGranularity(ur_context_handle_t Context, return Size; } +ur_result_t EnqueueUSMBlockingSet(ur_queue_handle_t Queue, void *Ptr, + char Value, size_t Size, uint32_t NumEvents, + const ur_event_handle_t *EventWaitList, + ur_event_handle_t *OutEvent) { + if (Size == 0) { + return UR_RESULT_SUCCESS; + } + return getContext()->urDdiTable.Enqueue.pfnUSMFill( + Queue, Ptr, 1, &Value, Size, NumEvents, EventWaitList, OutEvent); + } + } // namespace ur_sanitizer_layer diff --git 
a/source/loader/layers/sanitizer/ur_sanitizer_utils.hpp b/source/loader/layers/sanitizer/ur_sanitizer_utils.hpp index 92cb4cebc4..a04886e5e5 100644 --- a/source/loader/layers/sanitizer/ur_sanitizer_utils.hpp +++ b/source/loader/layers/sanitizer/ur_sanitizer_utils.hpp @@ -34,12 +34,16 @@ ur_context_handle_t GetContext(ur_queue_handle_t Queue); ur_context_handle_t GetContext(ur_program_handle_t Program); ur_context_handle_t GetContext(ur_kernel_handle_t Kernel); ur_device_handle_t GetDevice(ur_queue_handle_t Queue); +std::vector<ur_device_handle_t> GetDevices(ur_context_handle_t Context); +std::vector<ur_device_handle_t> GetDevices(ur_program_handle_t Program); DeviceType GetDeviceType(ur_context_handle_t Context, ur_device_handle_t Device); +ur_device_handle_t GetParentDevice(ur_device_handle_t Device); +bool GetDeviceUSMCapability(ur_device_handle_t Device, + ur_device_info_t Feature); std::string GetKernelName(ur_kernel_handle_t Kernel); size_t GetDeviceLocalMemorySize(ur_device_handle_t Device); ur_program_handle_t GetProgram(ur_kernel_handle_t Kernel); -std::vector<ur_device_handle_t> GetProgramDevices(ur_program_handle_t Program); ur_device_handle_t GetUSMAllocDevice(ur_context_handle_t Context, const void *MemPtr); uint32_t GetKernelNumArgs(ur_kernel_handle_t Kernel); @@ -50,4 +54,10 @@ size_t GetKernelPrivateMemorySize(ur_kernel_handle_t Kernel, size_t GetVirtualMemGranularity(ur_context_handle_t Context, ur_device_handle_t Device); +ur_result_t +EnqueueUSMBlockingSet(ur_queue_handle_t Queue, void *Ptr, char Value, + size_t Size, uint32_t NumEvents = 0, + const ur_event_handle_t *EventWaitList = nullptr, + ur_event_handle_t *OutEvent = nullptr); + } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/tracing/ur_tracing_layer.cpp b/source/loader/layers/tracing/ur_tracing_layer.cpp index 88aff57526..7a3f30d9a8 100644 --- a/source/loader/layers/tracing/ur_tracing_layer.cpp +++ b/source/loader/layers/tracing/ur_tracing_layer.cpp @@ -21,7 +21,7 @@ namespace ur_tracing_layer { context_t *getContext() { return 
context_t::get_direct(); } -constexpr auto CALL_STREAM_NAME = "ur"; +constexpr auto CALL_STREAM_NAME = "ur.call"; constexpr auto STREAM_VER_MAJOR = UR_MAJOR_VERSION(UR_API_VERSION_CURRENT); constexpr auto STREAM_VER_MINOR = UR_MINOR_VERSION(UR_API_VERSION_CURRENT); @@ -29,36 +29,20 @@ constexpr auto STREAM_VER_MINOR = UR_MINOR_VERSION(UR_API_VERSION_CURRENT); // Unfortunately this doesn't match the semantics of XPTI, which can be initialized // and finalized exactly once. To workaround this, XPTI is globally initialized on // first use and finalized in the destructor. -class XptiContext { - XptiContext() { - xptiFrameworkInitialize(); - inited = true; - } - - ~XptiContext() { - xptiFrameworkFinalize(); - inited = false; - } - - // Accessing this after destruction is technically UB, but if we get there, - // it means something is calling UR after it has been destroyed at program - // exit. - std::atomic_bool inited; - - public: - static bool running() { - static XptiContext context; - return context.inited; - } +struct XptiContextManager { + XptiContextManager() { xptiFrameworkInitialize(); } + ~XptiContextManager() { xptiFrameworkFinalize(); } }; +static std::shared_ptr<XptiContextManager> xptiContextManagerGet() { + static auto contextManager = std::make_shared<XptiContextManager>(); + return contextManager; +} static thread_local xpti_td *activeEvent; /////////////////////////////////////////////////////////////////////////////// context_t::context_t() : logger(logger::create_logger("tracing", true, true)) { - if (!XptiContext::running()) { - return; - } + this->xptiContextManager = xptiContextManagerGet(); call_stream_id = xptiRegisterStream(CALL_STREAM_NAME); std::ostringstream streamv; @@ -69,20 +53,12 @@ context_t::context_t() : logger(logger::create_logger("tracing", true, true)) { void context_t::notify(uint16_t trace_type, uint32_t id, const char *name, void *args, ur_result_t *resultp, uint64_t instance) { - if (!XptiContext::running()) { - return; - } - xpti::function_with_args_t payload{id, 
name, args, resultp, nullptr}; xptiNotifySubscribers(call_stream_id, trace_type, nullptr, activeEvent, instance, &payload); } uint64_t context_t::notify_begin(uint32_t id, const char *name, void *args) { - if (!XptiContext::running()) { - return 0; - } - if (auto loc = codelocData.get_codeloc()) { xpti::payload_t payload = xpti::payload_t(loc->functionName, loc->sourceFile, loc->lineNumber, @@ -101,20 +77,10 @@ uint64_t context_t::notify_begin(uint32_t id, const char *name, void *args) { void context_t::notify_end(uint32_t id, const char *name, void *args, ur_result_t *resultp, uint64_t instance) { - if (!XptiContext::running()) { - return; - } - notify((uint16_t)xpti::trace_point_type_t::function_with_args_end, id, name, args, resultp, instance); } /////////////////////////////////////////////////////////////////////////////// -context_t::~context_t() { - if (!XptiContext::running()) { - return; - } - - xptiFinalize(CALL_STREAM_NAME); -} +context_t::~context_t() { xptiFinalize(CALL_STREAM_NAME); } } // namespace ur_tracing_layer diff --git a/source/loader/layers/tracing/ur_tracing_layer.hpp b/source/loader/layers/tracing/ur_tracing_layer.hpp index 1a5c542ee6..b7e3fc0314 100644 --- a/source/loader/layers/tracing/ur_tracing_layer.hpp +++ b/source/loader/layers/tracing/ur_tracing_layer.hpp @@ -21,6 +21,8 @@ #define TRACING_COMP_NAME "tracing layer" namespace ur_tracing_layer { +struct XptiContextManager; + /////////////////////////////////////////////////////////////////////////////// class __urdlllocal context_t : public proxy_layer_context_t, public AtomicSingleton<context_t> { @@ -47,6 +49,8 @@ class __urdlllocal context_t : public proxy_layer_context_t, uint8_t call_stream_id; inline static const std::string name = "UR_LAYER_TRACING"; + + std::shared_ptr<XptiContextManager> xptiContextManager; }; context_t *getContext(); diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 5740b9eebe..c206bd1fc4 100644 --- 
a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -26,7 +26,7 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGet( ur_adapter_handle_t * phAdapters, ///< [out][optional][range(0, NumEntries)] array of handle of adapters. ///< If NumEntries is less than the number of adapters available, then - ///< ::urAdapterGet shall only retrieve that number of platforms. + ///< ::urAdapterGet shall only retrieve that number of adapters. uint32_t * pNumAdapters ///< [out][optional] returns the total number of adapters available. ) { @@ -40,16 +40,21 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGet( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_ADAPTER_GET, "urAdapterGet", ¶ms); - getContext()->logger.info("---> urAdapterGet"); + auto &logger = getContext()->logger; + logger.info(" ---> urAdapterGet\n"); ur_result_t result = pfnAdapterGet(NumEntries, phAdapters, pNumAdapters); getContext()->notify_end(UR_FUNCTION_ADAPTER_GET, "urAdapterGet", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_ADAPTER_GET, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ADAPTER_GET, + ¶ms); + logger.info(" <--- urAdapterGet({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -69,17 +74,21 @@ __urdlllocal ur_result_t UR_APICALL urAdapterRelease( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_ADAPTER_RELEASE, "urAdapterRelease", ¶ms); - getContext()->logger.info("---> urAdapterRelease"); + auto &logger = getContext()->logger; + logger.info(" ---> urAdapterRelease\n"); ur_result_t result = pfnAdapterRelease(hAdapter); getContext()->notify_end(UR_FUNCTION_ADAPTER_RELEASE, "urAdapterRelease", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, 
UR_FUNCTION_ADAPTER_RELEASE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ADAPTER_RELEASE, + ¶ms); + logger.info(" <--- urAdapterRelease({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -99,17 +108,21 @@ __urdlllocal ur_result_t UR_APICALL urAdapterRetain( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_ADAPTER_RETAIN, "urAdapterRetain", ¶ms); - getContext()->logger.info("---> urAdapterRetain"); + auto &logger = getContext()->logger; + logger.info(" ---> urAdapterRetain\n"); ur_result_t result = pfnAdapterRetain(hAdapter); getContext()->notify_end(UR_FUNCTION_ADAPTER_RETAIN, "urAdapterRetain", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_ADAPTER_RETAIN, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ADAPTER_RETAIN, + ¶ms); + logger.info(" <--- urAdapterRetain({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -137,7 +150,8 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGetLastError( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ADAPTER_GET_LAST_ERROR, "urAdapterGetLastError", ¶ms); - getContext()->logger.info("---> urAdapterGetLastError"); + auto &logger = getContext()->logger; + logger.info(" ---> urAdapterGetLastError\n"); ur_result_t result = pfnAdapterGetLastError(hAdapter, ppMessage, pError); @@ -145,10 +159,13 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGetLastError( "urAdapterGetLastError", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ADAPTER_GET_LAST_ERROR, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), 
result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ADAPTER_GET_LAST_ERROR, ¶ms); + logger.info(" <--- urAdapterGetLastError({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -179,7 +196,8 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGetInfo( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_ADAPTER_GET_INFO, "urAdapterGetInfo", ¶ms); - getContext()->logger.info("---> urAdapterGetInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urAdapterGetInfo\n"); ur_result_t result = pfnAdapterGetInfo(hAdapter, propName, propSize, pPropValue, pPropSizeRet); @@ -187,10 +205,13 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGetInfo( getContext()->notify_end(UR_FUNCTION_ADAPTER_GET_INFO, "urAdapterGetInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_ADAPTER_GET_INFO, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ADAPTER_GET_INFO, + ¶ms); + logger.info(" <--- urAdapterGetInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -224,7 +245,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGet( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_PLATFORM_GET, "urPlatformGet", ¶ms); - getContext()->logger.info("---> urPlatformGet"); + auto &logger = getContext()->logger; + logger.info(" ---> urPlatformGet\n"); ur_result_t result = pfnGet(phAdapters, NumAdapters, NumEntries, phPlatforms, pNumPlatforms); @@ -232,10 +254,13 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGet( getContext()->notify_end(UR_FUNCTION_PLATFORM_GET, "urPlatformGet", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_PLATFORM_GET, - ¶ms); - 
getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PLATFORM_GET, + ¶ms); + logger.info(" <--- urPlatformGet({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -266,7 +291,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetInfo( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_PLATFORM_GET_INFO, "urPlatformGetInfo", ¶ms); - getContext()->logger.info("---> urPlatformGetInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urPlatformGetInfo\n"); ur_result_t result = pfnGetInfo(hPlatform, propName, propSize, pPropValue, pPropSizeRet); @@ -274,10 +300,13 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetInfo( getContext()->notify_end(UR_FUNCTION_PLATFORM_GET_INFO, "urPlatformGetInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_PLATFORM_GET_INFO, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PLATFORM_GET_INFO, + ¶ms); + logger.info(" <--- urPlatformGetInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -299,7 +328,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetApiVersion( getContext()->notify_begin(UR_FUNCTION_PLATFORM_GET_API_VERSION, "urPlatformGetApiVersion", ¶ms); - getContext()->logger.info("---> urPlatformGetApiVersion"); + auto &logger = getContext()->logger; + logger.info(" ---> urPlatformGetApiVersion\n"); ur_result_t result = pfnGetApiVersion(hPlatform, pVersion); @@ -307,10 +337,13 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetApiVersion( "urPlatformGetApiVersion", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, 
UR_FUNCTION_PLATFORM_GET_API_VERSION, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PLATFORM_GET_API_VERSION, ¶ms); + logger.info(" <--- urPlatformGetApiVersion({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -335,7 +368,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetNativeHandle( getContext()->notify_begin(UR_FUNCTION_PLATFORM_GET_NATIVE_HANDLE, "urPlatformGetNativeHandle", ¶ms); - getContext()->logger.info("---> urPlatformGetNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urPlatformGetNativeHandle\n"); ur_result_t result = pfnGetNativeHandle(hPlatform, phNativePlatform); @@ -343,10 +377,13 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetNativeHandle( "urPlatformGetNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_PLATFORM_GET_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PLATFORM_GET_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urPlatformGetNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -376,7 +413,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( UR_FUNCTION_PLATFORM_CREATE_WITH_NATIVE_HANDLE, "urPlatformCreateWithNativeHandle", ¶ms); - getContext()->logger.info("---> urPlatformCreateWithNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urPlatformCreateWithNativeHandle\n"); ur_result_t result = pfnCreateWithNativeHandle(hNativePlatform, hAdapter, pProperties, phPlatform); @@ -385,10 +423,13 @@ __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( "urPlatformCreateWithNativeHandle", 
¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_PLATFORM_CREATE_WITH_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PLATFORM_CREATE_WITH_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urPlatformCreateWithNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -416,7 +457,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetBackendOption( getContext()->notify_begin(UR_FUNCTION_PLATFORM_GET_BACKEND_OPTION, "urPlatformGetBackendOption", ¶ms); - getContext()->logger.info("---> urPlatformGetBackendOption"); + auto &logger = getContext()->logger; + logger.info(" ---> urPlatformGetBackendOption\n"); ur_result_t result = pfnGetBackendOption(hPlatform, pFrontendOption, ppPlatformOption); @@ -425,10 +467,13 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetBackendOption( "urPlatformGetBackendOption", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_PLATFORM_GET_BACKEND_OPTION, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PLATFORM_GET_BACKEND_OPTION, ¶ms); + logger.info(" <--- urPlatformGetBackendOption({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -461,7 +506,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGet( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_DEVICE_GET, "urDeviceGet", ¶ms); - getContext()->logger.info("---> urDeviceGet"); + auto &logger = getContext()->logger; + logger.info(" ---> urDeviceGet\n"); ur_result_t result = pfnGet(hPlatform, DeviceType, NumEntries, phDevices, pNumDevices); @@ -469,9 +515,12 @@ __urdlllocal 
ur_result_t UR_APICALL urDeviceGet( getContext()->notify_end(UR_FUNCTION_DEVICE_GET, "urDeviceGet", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_GET, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_GET, + ¶ms); + logger.info(" <--- urDeviceGet({}) -> {};\n", args_str.str(), result); + } return result; } @@ -503,7 +552,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_DEVICE_GET_INFO, "urDeviceGetInfo", ¶ms); - getContext()->logger.info("---> urDeviceGetInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urDeviceGetInfo\n"); ur_result_t result = pfnGetInfo(hDevice, propName, propSize, pPropValue, pPropSizeRet); @@ -511,10 +561,13 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( getContext()->notify_end(UR_FUNCTION_DEVICE_GET_INFO, "urDeviceGetInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_GET_INFO, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_GET_INFO, + ¶ms); + logger.info(" <--- urDeviceGetInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -535,17 +588,21 @@ __urdlllocal ur_result_t UR_APICALL urDeviceRetain( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_DEVICE_RETAIN, "urDeviceRetain", ¶ms); - getContext()->logger.info("---> urDeviceRetain"); + auto &logger = getContext()->logger; + logger.info(" ---> urDeviceRetain\n"); ur_result_t result = pfnRetain(hDevice); getContext()->notify_end(UR_FUNCTION_DEVICE_RETAIN, "urDeviceRetain", ¶ms, &result, 
instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_RETAIN, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_RETAIN, + ¶ms); + logger.info(" <--- urDeviceRetain({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -566,17 +623,21 @@ __urdlllocal ur_result_t UR_APICALL urDeviceRelease( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_DEVICE_RELEASE, "urDeviceRelease", ¶ms); - getContext()->logger.info("---> urDeviceRelease"); + auto &logger = getContext()->logger; + logger.info(" ---> urDeviceRelease\n"); ur_result_t result = pfnRelease(hDevice); getContext()->notify_end(UR_FUNCTION_DEVICE_RELEASE, "urDeviceRelease", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_RELEASE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_RELEASE, + ¶ms); + logger.info(" <--- urDeviceRelease({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -607,7 +668,8 @@ __urdlllocal ur_result_t UR_APICALL urDevicePartition( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_DEVICE_PARTITION, "urDevicePartition", ¶ms); - getContext()->logger.info("---> urDevicePartition"); + auto &logger = getContext()->logger; + logger.info(" ---> urDevicePartition\n"); ur_result_t result = pfnPartition(hDevice, pProperties, NumDevices, phSubDevices, pNumDevicesRet); @@ -615,10 +677,13 @@ __urdlllocal ur_result_t UR_APICALL urDevicePartition( getContext()->notify_end(UR_FUNCTION_DEVICE_PARTITION, "urDevicePartition", ¶ms, &result, instance); - std::ostringstream args_str; - 
ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_PARTITION, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_PARTITION, + ¶ms); + logger.info(" <--- urDevicePartition({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -648,7 +713,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceSelectBinary( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_DEVICE_SELECT_BINARY, "urDeviceSelectBinary", ¶ms); - getContext()->logger.info("---> urDeviceSelectBinary"); + auto &logger = getContext()->logger; + logger.info(" ---> urDeviceSelectBinary\n"); ur_result_t result = pfnSelectBinary(hDevice, pBinaries, NumBinaries, pSelectedBinary); @@ -657,10 +723,13 @@ __urdlllocal ur_result_t UR_APICALL urDeviceSelectBinary( "urDeviceSelectBinary", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_SELECT_BINARY, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_DEVICE_SELECT_BINARY, ¶ms); + logger.info(" <--- urDeviceSelectBinary({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -684,7 +753,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetNativeHandle( getContext()->notify_begin(UR_FUNCTION_DEVICE_GET_NATIVE_HANDLE, "urDeviceGetNativeHandle", ¶ms); - getContext()->logger.info("---> urDeviceGetNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urDeviceGetNativeHandle\n"); ur_result_t result = pfnGetNativeHandle(hDevice, phNativeDevice); @@ -692,10 +762,13 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetNativeHandle( "urDeviceGetNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - 
ur::extras::printFunctionParams( - args_str, UR_FUNCTION_DEVICE_GET_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_DEVICE_GET_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urDeviceGetNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -705,7 +778,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. ur_device_handle_t @@ -719,24 +793,28 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( } ur_device_create_with_native_handle_params_t params = { - &hNativeDevice, &hPlatform, &pProperties, &phDevice}; + &hNativeDevice, &hAdapter, &pProperties, &phDevice}; uint64_t instance = getContext()->notify_begin(UR_FUNCTION_DEVICE_CREATE_WITH_NATIVE_HANDLE, "urDeviceCreateWithNativeHandle", ¶ms); - getContext()->logger.info("---> urDeviceCreateWithNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urDeviceCreateWithNativeHandle\n"); - ur_result_t result = pfnCreateWithNativeHandle(hNativeDevice, hPlatform, + ur_result_t result = pfnCreateWithNativeHandle(hNativeDevice, hAdapter, pProperties, phDevice); getContext()->notify_end(UR_FUNCTION_DEVICE_CREATE_WITH_NATIVE_HANDLE, "urDeviceCreateWithNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_DEVICE_CREATE_WITH_NATIVE_HANDLE, ¶ms); - 
getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_DEVICE_CREATE_WITH_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urDeviceCreateWithNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -765,7 +843,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetGlobalTimestamps( getContext()->notify_begin(UR_FUNCTION_DEVICE_GET_GLOBAL_TIMESTAMPS, "urDeviceGetGlobalTimestamps", ¶ms); - getContext()->logger.info("---> urDeviceGetGlobalTimestamps"); + auto &logger = getContext()->logger; + logger.info(" ---> urDeviceGetGlobalTimestamps\n"); ur_result_t result = pfnGetGlobalTimestamps(hDevice, pDeviceTimestamp, pHostTimestamp); @@ -774,10 +853,13 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetGlobalTimestamps( "urDeviceGetGlobalTimestamps", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_DEVICE_GET_GLOBAL_TIMESTAMPS, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_DEVICE_GET_GLOBAL_TIMESTAMPS, ¶ms); + logger.info(" <--- urDeviceGetGlobalTimestamps({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -804,7 +886,8 @@ __urdlllocal ur_result_t UR_APICALL urContextCreate( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_CONTEXT_CREATE, "urContextCreate", ¶ms); - getContext()->logger.info("---> urContextCreate"); + auto &logger = getContext()->logger; + logger.info(" ---> urContextCreate\n"); ur_result_t result = pfnCreate(DeviceCount, phDevices, pProperties, phContext); @@ -812,10 +895,13 @@ __urdlllocal ur_result_t UR_APICALL urContextCreate( getContext()->notify_end(UR_FUNCTION_CONTEXT_CREATE, "urContextCreate", ¶ms, &result, instance); - 
std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_CONTEXT_CREATE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_CONTEXT_CREATE, + ¶ms); + logger.info(" <--- urContextCreate({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -836,17 +922,21 @@ __urdlllocal ur_result_t UR_APICALL urContextRetain( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_CONTEXT_RETAIN, "urContextRetain", ¶ms); - getContext()->logger.info("---> urContextRetain"); + auto &logger = getContext()->logger; + logger.info(" ---> urContextRetain\n"); ur_result_t result = pfnRetain(hContext); getContext()->notify_end(UR_FUNCTION_CONTEXT_RETAIN, "urContextRetain", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_CONTEXT_RETAIN, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_CONTEXT_RETAIN, + ¶ms); + logger.info(" <--- urContextRetain({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -867,17 +957,21 @@ __urdlllocal ur_result_t UR_APICALL urContextRelease( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_CONTEXT_RELEASE, "urContextRelease", ¶ms); - getContext()->logger.info("---> urContextRelease"); + auto &logger = getContext()->logger; + logger.info(" ---> urContextRelease\n"); ur_result_t result = pfnRelease(hContext); getContext()->notify_end(UR_FUNCTION_CONTEXT_RELEASE, "urContextRelease", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_CONTEXT_RELEASE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= 
logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_CONTEXT_RELEASE, + ¶ms); + logger.info(" <--- urContextRelease({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -910,7 +1004,8 @@ __urdlllocal ur_result_t UR_APICALL urContextGetInfo( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_CONTEXT_GET_INFO, "urContextGetInfo", ¶ms); - getContext()->logger.info("---> urContextGetInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urContextGetInfo\n"); ur_result_t result = pfnGetInfo(hContext, propName, propSize, pPropValue, pPropSizeRet); @@ -918,10 +1013,13 @@ __urdlllocal ur_result_t UR_APICALL urContextGetInfo( getContext()->notify_end(UR_FUNCTION_CONTEXT_GET_INFO, "urContextGetInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_CONTEXT_GET_INFO, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_CONTEXT_GET_INFO, + ¶ms); + logger.info(" <--- urContextGetInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -946,7 +1044,8 @@ __urdlllocal ur_result_t UR_APICALL urContextGetNativeHandle( getContext()->notify_begin(UR_FUNCTION_CONTEXT_GET_NATIVE_HANDLE, "urContextGetNativeHandle", ¶ms); - getContext()->logger.info("---> urContextGetNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urContextGetNativeHandle\n"); ur_result_t result = pfnGetNativeHandle(hContext, phNativeContext); @@ -954,10 +1053,13 @@ __urdlllocal ur_result_t UR_APICALL urContextGetNativeHandle( "urContextGetNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_CONTEXT_GET_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + 
if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_CONTEXT_GET_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urContextGetNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -992,7 +1094,8 @@ __urdlllocal ur_result_t UR_APICALL urContextCreateWithNativeHandle( UR_FUNCTION_CONTEXT_CREATE_WITH_NATIVE_HANDLE, "urContextCreateWithNativeHandle", ¶ms); - getContext()->logger.info("---> urContextCreateWithNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urContextCreateWithNativeHandle\n"); ur_result_t result = pfnCreateWithNativeHandle(hNativeContext, hAdapter, numDevices, @@ -1002,10 +1105,13 @@ __urdlllocal ur_result_t UR_APICALL urContextCreateWithNativeHandle( "urContextCreateWithNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_CONTEXT_CREATE_WITH_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_CONTEXT_CREATE_WITH_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urContextCreateWithNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -1032,7 +1138,8 @@ __urdlllocal ur_result_t UR_APICALL urContextSetExtendedDeleter( getContext()->notify_begin(UR_FUNCTION_CONTEXT_SET_EXTENDED_DELETER, "urContextSetExtendedDeleter", ¶ms); - getContext()->logger.info("---> urContextSetExtendedDeleter"); + auto &logger = getContext()->logger; + logger.info(" ---> urContextSetExtendedDeleter\n"); ur_result_t result = pfnSetExtendedDeleter(hContext, pfnDeleter, pUserData); @@ -1040,10 +1147,13 @@ __urdlllocal ur_result_t UR_APICALL urContextSetExtendedDeleter( "urContextSetExtendedDeleter", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( 
- args_str, UR_FUNCTION_CONTEXT_SET_EXTENDED_DELETER, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_CONTEXT_SET_EXTENDED_DELETER, ¶ms); + logger.info(" <--- urContextSetExtendedDeleter({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -1070,7 +1180,8 @@ __urdlllocal ur_result_t UR_APICALL urMemImageCreate( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_MEM_IMAGE_CREATE, "urMemImageCreate", ¶ms); - getContext()->logger.info("---> urMemImageCreate"); + auto &logger = getContext()->logger; + logger.info(" ---> urMemImageCreate\n"); ur_result_t result = pfnImageCreate(hContext, flags, pImageFormat, pImageDesc, pHost, phMem); @@ -1078,10 +1189,13 @@ __urdlllocal ur_result_t UR_APICALL urMemImageCreate( getContext()->notify_end(UR_FUNCTION_MEM_IMAGE_CREATE, "urMemImageCreate", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_IMAGE_CREATE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_IMAGE_CREATE, + ¶ms); + logger.info(" <--- urMemImageCreate({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1108,7 +1222,8 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferCreate( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_MEM_BUFFER_CREATE, "urMemBufferCreate", ¶ms); - getContext()->logger.info("---> urMemBufferCreate"); + auto &logger = getContext()->logger; + logger.info(" ---> urMemBufferCreate\n"); ur_result_t result = pfnBufferCreate(hContext, flags, size, pProperties, phBuffer); @@ -1116,10 +1231,13 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferCreate( getContext()->notify_end(UR_FUNCTION_MEM_BUFFER_CREATE, 
"urMemBufferCreate", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_BUFFER_CREATE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_BUFFER_CREATE, + ¶ms); + logger.info(" <--- urMemBufferCreate({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1140,16 +1258,20 @@ __urdlllocal ur_result_t UR_APICALL urMemRetain( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_MEM_RETAIN, "urMemRetain", ¶ms); - getContext()->logger.info("---> urMemRetain"); + auto &logger = getContext()->logger; + logger.info(" ---> urMemRetain\n"); ur_result_t result = pfnRetain(hMem); getContext()->notify_end(UR_FUNCTION_MEM_RETAIN, "urMemRetain", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_RETAIN, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_RETAIN, + ¶ms); + logger.info(" <--- urMemRetain({}) -> {};\n", args_str.str(), result); + } return result; } @@ -1170,16 +1292,21 @@ __urdlllocal ur_result_t UR_APICALL urMemRelease( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_MEM_RELEASE, "urMemRelease", ¶ms); - getContext()->logger.info("---> urMemRelease"); + auto &logger = getContext()->logger; + logger.info(" ---> urMemRelease\n"); ur_result_t result = pfnRelease(hMem); getContext()->notify_end(UR_FUNCTION_MEM_RELEASE, "urMemRelease", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_RELEASE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + 
std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_RELEASE, + ¶ms); + logger.info(" <--- urMemRelease({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1207,7 +1334,8 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferPartition( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_MEM_BUFFER_PARTITION, "urMemBufferPartition", ¶ms); - getContext()->logger.info("---> urMemBufferPartition"); + auto &logger = getContext()->logger; + logger.info(" ---> urMemBufferPartition\n"); ur_result_t result = pfnBufferPartition(hBuffer, flags, bufferCreateType, pRegion, phMem); @@ -1216,10 +1344,13 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferPartition( "urMemBufferPartition", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_BUFFER_PARTITION, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_MEM_BUFFER_PARTITION, ¶ms); + logger.info(" <--- urMemBufferPartition({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1244,7 +1375,8 @@ __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_MEM_GET_NATIVE_HANDLE, "urMemGetNativeHandle", ¶ms); - getContext()->logger.info("---> urMemGetNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urMemGetNativeHandle\n"); ur_result_t result = pfnGetNativeHandle(hMem, hDevice, phNativeMem); @@ -1252,10 +1384,13 @@ __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle( "urMemGetNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_GET_NATIVE_HANDLE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + 
std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_MEM_GET_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urMemGetNativeHandle({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1284,7 +1419,8 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( UR_FUNCTION_MEM_BUFFER_CREATE_WITH_NATIVE_HANDLE, "urMemBufferCreateWithNativeHandle", ¶ms); - getContext()->logger.info("---> urMemBufferCreateWithNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urMemBufferCreateWithNativeHandle\n"); ur_result_t result = pfnBufferCreateWithNativeHandle(hNativeMem, hContext, pProperties, phMem); @@ -1293,10 +1429,14 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( "urMemBufferCreateWithNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_MEM_BUFFER_CREATE_WITH_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_MEM_BUFFER_CREATE_WITH_NATIVE_HANDLE, + ¶ms); + logger.info(" <--- urMemBufferCreateWithNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -1329,7 +1469,8 @@ __urdlllocal ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( UR_FUNCTION_MEM_IMAGE_CREATE_WITH_NATIVE_HANDLE, "urMemImageCreateWithNativeHandle", ¶ms); - getContext()->logger.info("---> urMemImageCreateWithNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urMemImageCreateWithNativeHandle\n"); ur_result_t result = pfnImageCreateWithNativeHandle( hNativeMem, hContext, pImageFormat, pImageDesc, pProperties, phMem); @@ -1338,10 +1479,13 @@ __urdlllocal ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( "urMemImageCreateWithNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - 
ur::extras::printFunctionParams( - args_str, UR_FUNCTION_MEM_IMAGE_CREATE_WITH_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_MEM_IMAGE_CREATE_WITH_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urMemImageCreateWithNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -1374,7 +1518,8 @@ __urdlllocal ur_result_t UR_APICALL urMemGetInfo( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_MEM_GET_INFO, "urMemGetInfo", ¶ms); - getContext()->logger.info("---> urMemGetInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urMemGetInfo\n"); ur_result_t result = pfnGetInfo(hMemory, propName, propSize, pPropValue, pPropSizeRet); @@ -1382,10 +1527,13 @@ __urdlllocal ur_result_t UR_APICALL urMemGetInfo( getContext()->notify_end(UR_FUNCTION_MEM_GET_INFO, "urMemGetInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_GET_INFO, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_GET_INFO, + ¶ms); + logger.info(" <--- urMemGetInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1417,7 +1565,8 @@ __urdlllocal ur_result_t UR_APICALL urMemImageGetInfo( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_MEM_IMAGE_GET_INFO, "urMemImageGetInfo", ¶ms); - getContext()->logger.info("---> urMemImageGetInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urMemImageGetInfo\n"); ur_result_t result = pfnImageGetInfo(hMemory, propName, propSize, pPropValue, pPropSizeRet); @@ -1425,10 +1574,13 @@ __urdlllocal ur_result_t UR_APICALL urMemImageGetInfo( 
getContext()->notify_end(UR_FUNCTION_MEM_IMAGE_GET_INFO, "urMemImageGetInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_IMAGE_GET_INFO, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_MEM_IMAGE_GET_INFO, ¶ms); + logger.info(" <--- urMemImageGetInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1451,17 +1603,21 @@ __urdlllocal ur_result_t UR_APICALL urSamplerCreate( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_SAMPLER_CREATE, "urSamplerCreate", ¶ms); - getContext()->logger.info("---> urSamplerCreate"); + auto &logger = getContext()->logger; + logger.info(" ---> urSamplerCreate\n"); ur_result_t result = pfnCreate(hContext, pDesc, phSampler); getContext()->notify_end(UR_FUNCTION_SAMPLER_CREATE, "urSamplerCreate", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_SAMPLER_CREATE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_SAMPLER_CREATE, + ¶ms); + logger.info(" <--- urSamplerCreate({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1482,17 +1638,21 @@ __urdlllocal ur_result_t UR_APICALL urSamplerRetain( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_SAMPLER_RETAIN, "urSamplerRetain", ¶ms); - getContext()->logger.info("---> urSamplerRetain"); + auto &logger = getContext()->logger; + logger.info(" ---> urSamplerRetain\n"); ur_result_t result = pfnRetain(hSampler); getContext()->notify_end(UR_FUNCTION_SAMPLER_RETAIN, "urSamplerRetain", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, 
UR_FUNCTION_SAMPLER_RETAIN, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_SAMPLER_RETAIN, + ¶ms); + logger.info(" <--- urSamplerRetain({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1513,17 +1673,21 @@ __urdlllocal ur_result_t UR_APICALL urSamplerRelease( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_SAMPLER_RELEASE, "urSamplerRelease", ¶ms); - getContext()->logger.info("---> urSamplerRelease"); + auto &logger = getContext()->logger; + logger.info(" ---> urSamplerRelease\n"); ur_result_t result = pfnRelease(hSampler); getContext()->notify_end(UR_FUNCTION_SAMPLER_RELEASE, "urSamplerRelease", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_SAMPLER_RELEASE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_SAMPLER_RELEASE, + ¶ms); + logger.info(" <--- urSamplerRelease({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1552,7 +1716,8 @@ __urdlllocal ur_result_t UR_APICALL urSamplerGetInfo( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_SAMPLER_GET_INFO, "urSamplerGetInfo", ¶ms); - getContext()->logger.info("---> urSamplerGetInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urSamplerGetInfo\n"); ur_result_t result = pfnGetInfo(hSampler, propName, propSize, pPropValue, pPropSizeRet); @@ -1560,10 +1725,13 @@ __urdlllocal ur_result_t UR_APICALL urSamplerGetInfo( getContext()->notify_end(UR_FUNCTION_SAMPLER_GET_INFO, "urSamplerGetInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_SAMPLER_GET_INFO, - ¶ms); - getContext()->logger.info("({}) 
-> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_SAMPLER_GET_INFO, + ¶ms); + logger.info(" <--- urSamplerGetInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1588,7 +1756,8 @@ __urdlllocal ur_result_t UR_APICALL urSamplerGetNativeHandle( getContext()->notify_begin(UR_FUNCTION_SAMPLER_GET_NATIVE_HANDLE, "urSamplerGetNativeHandle", ¶ms); - getContext()->logger.info("---> urSamplerGetNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urSamplerGetNativeHandle\n"); ur_result_t result = pfnGetNativeHandle(hSampler, phNativeSampler); @@ -1596,10 +1765,13 @@ __urdlllocal ur_result_t UR_APICALL urSamplerGetNativeHandle( "urSamplerGetNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_SAMPLER_GET_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_SAMPLER_GET_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urSamplerGetNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -1628,7 +1800,8 @@ __urdlllocal ur_result_t UR_APICALL urSamplerCreateWithNativeHandle( UR_FUNCTION_SAMPLER_CREATE_WITH_NATIVE_HANDLE, "urSamplerCreateWithNativeHandle", ¶ms); - getContext()->logger.info("---> urSamplerCreateWithNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urSamplerCreateWithNativeHandle\n"); ur_result_t result = pfnCreateWithNativeHandle(hNativeSampler, hContext, pProperties, phSampler); @@ -1637,10 +1810,13 @@ __urdlllocal ur_result_t UR_APICALL urSamplerCreateWithNativeHandle( "urSamplerCreateWithNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, 
UR_FUNCTION_SAMPLER_CREATE_WITH_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_SAMPLER_CREATE_WITH_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urSamplerCreateWithNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -1668,17 +1844,21 @@ __urdlllocal ur_result_t UR_APICALL urUSMHostAlloc( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_USM_HOST_ALLOC, "urUSMHostAlloc", ¶ms); - getContext()->logger.info("---> urUSMHostAlloc"); + auto &logger = getContext()->logger; + logger.info(" ---> urUSMHostAlloc\n"); ur_result_t result = pfnHostAlloc(hContext, pUSMDesc, pool, size, ppMem); getContext()->notify_end(UR_FUNCTION_USM_HOST_ALLOC, "urUSMHostAlloc", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_HOST_ALLOC, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_HOST_ALLOC, + ¶ms); + logger.info(" <--- urUSMHostAlloc({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1707,7 +1887,8 @@ __urdlllocal ur_result_t UR_APICALL urUSMDeviceAlloc( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_USM_DEVICE_ALLOC, "urUSMDeviceAlloc", ¶ms); - getContext()->logger.info("---> urUSMDeviceAlloc"); + auto &logger = getContext()->logger; + logger.info(" ---> urUSMDeviceAlloc\n"); ur_result_t result = pfnDeviceAlloc(hContext, hDevice, pUSMDesc, pool, size, ppMem); @@ -1715,10 +1896,13 @@ __urdlllocal ur_result_t UR_APICALL urUSMDeviceAlloc( getContext()->notify_end(UR_FUNCTION_USM_DEVICE_ALLOC, "urUSMDeviceAlloc", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, 
UR_FUNCTION_USM_DEVICE_ALLOC, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_DEVICE_ALLOC, + ¶ms); + logger.info(" <--- urUSMDeviceAlloc({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1747,7 +1931,8 @@ __urdlllocal ur_result_t UR_APICALL urUSMSharedAlloc( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_USM_SHARED_ALLOC, "urUSMSharedAlloc", ¶ms); - getContext()->logger.info("---> urUSMSharedAlloc"); + auto &logger = getContext()->logger; + logger.info(" ---> urUSMSharedAlloc\n"); ur_result_t result = pfnSharedAlloc(hContext, hDevice, pUSMDesc, pool, size, ppMem); @@ -1755,10 +1940,13 @@ __urdlllocal ur_result_t UR_APICALL urUSMSharedAlloc( getContext()->notify_end(UR_FUNCTION_USM_SHARED_ALLOC, "urUSMSharedAlloc", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_SHARED_ALLOC, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_SHARED_ALLOC, + ¶ms); + logger.info(" <--- urUSMSharedAlloc({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1779,16 +1967,20 @@ __urdlllocal ur_result_t UR_APICALL urUSMFree( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_USM_FREE, "urUSMFree", ¶ms); - getContext()->logger.info("---> urUSMFree"); + auto &logger = getContext()->logger; + logger.info(" ---> urUSMFree\n"); ur_result_t result = pfnFree(hContext, pMem); getContext()->notify_end(UR_FUNCTION_USM_FREE, "urUSMFree", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_FREE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if 
(logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_FREE, + ¶ms); + logger.info(" <--- urUSMFree({}) -> {};\n", args_str.str(), result); + } return result; } @@ -1819,7 +2011,8 @@ __urdlllocal ur_result_t UR_APICALL urUSMGetMemAllocInfo( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_USM_GET_MEM_ALLOC_INFO, "urUSMGetMemAllocInfo", ¶ms); - getContext()->logger.info("---> urUSMGetMemAllocInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urUSMGetMemAllocInfo\n"); ur_result_t result = pfnGetMemAllocInfo(hContext, pMem, propName, propSize, pPropValue, pPropSizeRet); @@ -1828,10 +2021,13 @@ __urdlllocal ur_result_t UR_APICALL urUSMGetMemAllocInfo( "urUSMGetMemAllocInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_USM_GET_MEM_ALLOC_INFO, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_USM_GET_MEM_ALLOC_INFO, ¶ms); + logger.info(" <--- urUSMGetMemAllocInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1855,17 +2051,21 @@ __urdlllocal ur_result_t UR_APICALL urUSMPoolCreate( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_USM_POOL_CREATE, "urUSMPoolCreate", ¶ms); - getContext()->logger.info("---> urUSMPoolCreate"); + auto &logger = getContext()->logger; + logger.info(" ---> urUSMPoolCreate\n"); ur_result_t result = pfnPoolCreate(hContext, pPoolDesc, ppPool); getContext()->notify_end(UR_FUNCTION_USM_POOL_CREATE, "urUSMPoolCreate", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_POOL_CREATE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + 
std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_POOL_CREATE, + ¶ms); + logger.info(" <--- urUSMPoolCreate({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1885,17 +2085,21 @@ __urdlllocal ur_result_t UR_APICALL urUSMPoolRetain( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_USM_POOL_RETAIN, "urUSMPoolRetain", ¶ms); - getContext()->logger.info("---> urUSMPoolRetain"); + auto &logger = getContext()->logger; + logger.info(" ---> urUSMPoolRetain\n"); ur_result_t result = pfnPoolRetain(pPool); getContext()->notify_end(UR_FUNCTION_USM_POOL_RETAIN, "urUSMPoolRetain", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_POOL_RETAIN, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_POOL_RETAIN, + ¶ms); + logger.info(" <--- urUSMPoolRetain({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -1915,17 +2119,21 @@ __urdlllocal ur_result_t UR_APICALL urUSMPoolRelease( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_USM_POOL_RELEASE, "urUSMPoolRelease", ¶ms); - getContext()->logger.info("---> urUSMPoolRelease"); + auto &logger = getContext()->logger; + logger.info(" ---> urUSMPoolRelease\n"); ur_result_t result = pfnPoolRelease(pPool); getContext()->notify_end(UR_FUNCTION_USM_POOL_RELEASE, "urUSMPoolRelease", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_POOL_RELEASE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_POOL_RELEASE, + ¶ms); + logger.info(" <--- urUSMPoolRelease({}) -> {};\n", args_str.str(), + result); + } 
return result; } @@ -1953,7 +2161,8 @@ __urdlllocal ur_result_t UR_APICALL urUSMPoolGetInfo( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_USM_POOL_GET_INFO, "urUSMPoolGetInfo", ¶ms); - getContext()->logger.info("---> urUSMPoolGetInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urUSMPoolGetInfo\n"); ur_result_t result = pfnPoolGetInfo(hPool, propName, propSize, pPropValue, pPropSizeRet); @@ -1961,10 +2170,13 @@ __urdlllocal ur_result_t UR_APICALL urUSMPoolGetInfo( getContext()->notify_end(UR_FUNCTION_USM_POOL_GET_INFO, "urUSMPoolGetInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_POOL_GET_INFO, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_POOL_GET_INFO, + ¶ms); + logger.info(" <--- urUSMPoolGetInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -2001,7 +2213,8 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( getContext()->notify_begin(UR_FUNCTION_VIRTUAL_MEM_GRANULARITY_GET_INFO, "urVirtualMemGranularityGetInfo", ¶ms); - getContext()->logger.info("---> urVirtualMemGranularityGetInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urVirtualMemGranularityGetInfo\n"); ur_result_t result = pfnGranularityGetInfo( hContext, hDevice, propName, propSize, pPropValue, pPropSizeRet); @@ -2010,10 +2223,13 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( "urVirtualMemGranularityGetInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_VIRTUAL_MEM_GRANULARITY_GET_INFO, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + 
args_str, UR_FUNCTION_VIRTUAL_MEM_GRANULARITY_GET_INFO, ¶ms); + logger.info(" <--- urVirtualMemGranularityGetInfo({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -2043,17 +2259,21 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemReserve( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_VIRTUAL_MEM_RESERVE, "urVirtualMemReserve", ¶ms); - getContext()->logger.info("---> urVirtualMemReserve"); + auto &logger = getContext()->logger; + logger.info(" ---> urVirtualMemReserve\n"); ur_result_t result = pfnReserve(hContext, pStart, size, ppStart); getContext()->notify_end(UR_FUNCTION_VIRTUAL_MEM_RESERVE, "urVirtualMemReserve", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_VIRTUAL_MEM_RESERVE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_VIRTUAL_MEM_RESERVE, ¶ms); + logger.info(" <--- urVirtualMemReserve({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -2076,17 +2296,21 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemFree( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_VIRTUAL_MEM_FREE, "urVirtualMemFree", ¶ms); - getContext()->logger.info("---> urVirtualMemFree"); + auto &logger = getContext()->logger; + logger.info(" ---> urVirtualMemFree\n"); ur_result_t result = pfnFree(hContext, pStart, size); getContext()->notify_end(UR_FUNCTION_VIRTUAL_MEM_FREE, "urVirtualMemFree", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_VIRTUAL_MEM_FREE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_VIRTUAL_MEM_FREE, + ¶ms); + logger.info(" <--- urVirtualMemFree({}) 
-> {};\n", args_str.str(), + result); + } return result; } @@ -2116,7 +2340,8 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemMap( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_VIRTUAL_MEM_MAP, "urVirtualMemMap", ¶ms); - getContext()->logger.info("---> urVirtualMemMap"); + auto &logger = getContext()->logger; + logger.info(" ---> urVirtualMemMap\n"); ur_result_t result = pfnMap(hContext, pStart, size, hPhysicalMem, offset, flags); @@ -2124,10 +2349,13 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemMap( getContext()->notify_end(UR_FUNCTION_VIRTUAL_MEM_MAP, "urVirtualMemMap", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_VIRTUAL_MEM_MAP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_VIRTUAL_MEM_MAP, + ¶ms); + logger.info(" <--- urVirtualMemMap({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -2150,17 +2378,21 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemUnmap( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_VIRTUAL_MEM_UNMAP, "urVirtualMemUnmap", ¶ms); - getContext()->logger.info("---> urVirtualMemUnmap"); + auto &logger = getContext()->logger; + logger.info(" ---> urVirtualMemUnmap\n"); ur_result_t result = pfnUnmap(hContext, pStart, size); getContext()->notify_end(UR_FUNCTION_VIRTUAL_MEM_UNMAP, "urVirtualMemUnmap", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_VIRTUAL_MEM_UNMAP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_VIRTUAL_MEM_UNMAP, + ¶ms); + logger.info(" <--- urVirtualMemUnmap({}) -> {};\n", args_str.str(), + result); + } return result; } 
@@ -2186,7 +2418,8 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemSetAccess( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_VIRTUAL_MEM_SET_ACCESS, "urVirtualMemSetAccess", ¶ms); - getContext()->logger.info("---> urVirtualMemSetAccess"); + auto &logger = getContext()->logger; + logger.info(" ---> urVirtualMemSetAccess\n"); ur_result_t result = pfnSetAccess(hContext, pStart, size, flags); @@ -2194,10 +2427,13 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemSetAccess( "urVirtualMemSetAccess", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_VIRTUAL_MEM_SET_ACCESS, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_VIRTUAL_MEM_SET_ACCESS, ¶ms); + logger.info(" <--- urVirtualMemSetAccess({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -2232,7 +2468,8 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGetInfo( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_VIRTUAL_MEM_GET_INFO, "urVirtualMemGetInfo", ¶ms); - getContext()->logger.info("---> urVirtualMemGetInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urVirtualMemGetInfo\n"); ur_result_t result = pfnGetInfo(hContext, pStart, size, propName, propSize, pPropValue, pPropSizeRet); @@ -2240,10 +2477,13 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGetInfo( getContext()->notify_end(UR_FUNCTION_VIRTUAL_MEM_GET_INFO, "urVirtualMemGetInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_VIRTUAL_MEM_GET_INFO, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_VIRTUAL_MEM_GET_INFO, ¶ms); + 
logger.info(" <--- urVirtualMemGetInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -2272,7 +2512,8 @@ __urdlllocal ur_result_t UR_APICALL urPhysicalMemCreate( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_PHYSICAL_MEM_CREATE, "urPhysicalMemCreate", ¶ms); - getContext()->logger.info("---> urPhysicalMemCreate"); + auto &logger = getContext()->logger; + logger.info(" ---> urPhysicalMemCreate\n"); ur_result_t result = pfnCreate(hContext, hDevice, size, pProperties, phPhysicalMem); @@ -2280,10 +2521,13 @@ __urdlllocal ur_result_t UR_APICALL urPhysicalMemCreate( getContext()->notify_end(UR_FUNCTION_PHYSICAL_MEM_CREATE, "urPhysicalMemCreate", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_PHYSICAL_MEM_CREATE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PHYSICAL_MEM_CREATE, ¶ms); + logger.info(" <--- urPhysicalMemCreate({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -2304,17 +2548,21 @@ __urdlllocal ur_result_t UR_APICALL urPhysicalMemRetain( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_PHYSICAL_MEM_RETAIN, "urPhysicalMemRetain", ¶ms); - getContext()->logger.info("---> urPhysicalMemRetain"); + auto &logger = getContext()->logger; + logger.info(" ---> urPhysicalMemRetain\n"); ur_result_t result = pfnRetain(hPhysicalMem); getContext()->notify_end(UR_FUNCTION_PHYSICAL_MEM_RETAIN, "urPhysicalMemRetain", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_PHYSICAL_MEM_RETAIN, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PHYSICAL_MEM_RETAIN, ¶ms); 
+ logger.info(" <--- urPhysicalMemRetain({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -2335,7 +2583,8 @@ __urdlllocal ur_result_t UR_APICALL urPhysicalMemRelease( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_PHYSICAL_MEM_RELEASE, "urPhysicalMemRelease", ¶ms); - getContext()->logger.info("---> urPhysicalMemRelease"); + auto &logger = getContext()->logger; + logger.info(" ---> urPhysicalMemRelease\n"); ur_result_t result = pfnRelease(hPhysicalMem); @@ -2343,10 +2592,13 @@ __urdlllocal ur_result_t UR_APICALL urPhysicalMemRelease( "urPhysicalMemRelease", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_PHYSICAL_MEM_RELEASE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PHYSICAL_MEM_RELEASE, ¶ms); + logger.info(" <--- urPhysicalMemRelease({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -2373,7 +2625,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramCreateWithIL( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_PROGRAM_CREATE_WITH_IL, "urProgramCreateWithIL", ¶ms); - getContext()->logger.info("---> urProgramCreateWithIL"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramCreateWithIL\n"); ur_result_t result = pfnCreateWithIL(hContext, pIL, length, pProperties, phProgram); @@ -2382,10 +2635,13 @@ __urdlllocal ur_result_t UR_APICALL urProgramCreateWithIL( "urProgramCreateWithIL", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_PROGRAM_CREATE_WITH_IL, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_CREATE_WITH_IL, ¶ms); + 
logger.info(" <--- urProgramCreateWithIL({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -2416,7 +2672,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramCreateWithBinary( getContext()->notify_begin(UR_FUNCTION_PROGRAM_CREATE_WITH_BINARY, "urProgramCreateWithBinary", ¶ms); - getContext()->logger.info("---> urProgramCreateWithBinary"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramCreateWithBinary\n"); ur_result_t result = pfnCreateWithBinary(hContext, hDevice, size, pBinary, pProperties, phProgram); @@ -2425,10 +2682,13 @@ __urdlllocal ur_result_t UR_APICALL urProgramCreateWithBinary( "urProgramCreateWithBinary", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_PROGRAM_CREATE_WITH_BINARY, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_CREATE_WITH_BINARY, ¶ms); + logger.info(" <--- urProgramCreateWithBinary({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -2451,17 +2711,21 @@ __urdlllocal ur_result_t UR_APICALL urProgramBuild( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_PROGRAM_BUILD, "urProgramBuild", ¶ms); - getContext()->logger.info("---> urProgramBuild"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramBuild\n"); ur_result_t result = pfnBuild(hContext, hProgram, pOptions); getContext()->notify_end(UR_FUNCTION_PROGRAM_BUILD, "urProgramBuild", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_BUILD, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_BUILD, + ¶ms); + logger.info(" <--- 
urProgramBuild({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -2485,17 +2749,21 @@ __urdlllocal ur_result_t UR_APICALL urProgramCompile( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_PROGRAM_COMPILE, "urProgramCompile", ¶ms); - getContext()->logger.info("---> urProgramCompile"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramCompile\n"); ur_result_t result = pfnCompile(hContext, hProgram, pOptions); getContext()->notify_end(UR_FUNCTION_PROGRAM_COMPILE, "urProgramCompile", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_COMPILE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_COMPILE, + ¶ms); + logger.info(" <--- urProgramCompile({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -2526,7 +2794,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramLink( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_PROGRAM_LINK, "urProgramLink", ¶ms); - getContext()->logger.info("---> urProgramLink"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramLink\n"); ur_result_t result = pfnLink(hContext, count, phPrograms, pOptions, phProgram); @@ -2534,10 +2803,13 @@ __urdlllocal ur_result_t UR_APICALL urProgramLink( getContext()->notify_end(UR_FUNCTION_PROGRAM_LINK, "urProgramLink", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_LINK, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_LINK, + ¶ms); + logger.info(" <--- urProgramLink({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -2558,17 
+2830,21 @@ __urdlllocal ur_result_t UR_APICALL urProgramRetain( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_PROGRAM_RETAIN, "urProgramRetain", ¶ms); - getContext()->logger.info("---> urProgramRetain"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramRetain\n"); ur_result_t result = pfnRetain(hProgram); getContext()->notify_end(UR_FUNCTION_PROGRAM_RETAIN, "urProgramRetain", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_RETAIN, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_RETAIN, + ¶ms); + logger.info(" <--- urProgramRetain({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -2589,17 +2865,21 @@ __urdlllocal ur_result_t UR_APICALL urProgramRelease( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_PROGRAM_RELEASE, "urProgramRelease", ¶ms); - getContext()->logger.info("---> urProgramRelease"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramRelease\n"); ur_result_t result = pfnRelease(hProgram); getContext()->notify_end(UR_FUNCTION_PROGRAM_RELEASE, "urProgramRelease", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_RELEASE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_RELEASE, + ¶ms); + logger.info(" <--- urProgramRelease({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -2631,7 +2911,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetFunctionPointer( getContext()->notify_begin(UR_FUNCTION_PROGRAM_GET_FUNCTION_POINTER, "urProgramGetFunctionPointer", ¶ms); - 
getContext()->logger.info("---> urProgramGetFunctionPointer"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramGetFunctionPointer\n"); ur_result_t result = pfnGetFunctionPointer(hDevice, hProgram, pFunctionName, ppFunctionPointer); @@ -2640,10 +2921,13 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetFunctionPointer( "urProgramGetFunctionPointer", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_PROGRAM_GET_FUNCTION_POINTER, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_GET_FUNCTION_POINTER, ¶ms); + logger.info(" <--- urProgramGetFunctionPointer({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -2677,7 +2961,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetGlobalVariablePointer( UR_FUNCTION_PROGRAM_GET_GLOBAL_VARIABLE_POINTER, "urProgramGetGlobalVariablePointer", ¶ms); - getContext()->logger.info("---> urProgramGetGlobalVariablePointer"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramGetGlobalVariablePointer\n"); ur_result_t result = pfnGetGlobalVariablePointer( hDevice, hProgram, pGlobalVariableName, pGlobalVariableSizeRet, @@ -2687,10 +2972,13 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetGlobalVariablePointer( "urProgramGetGlobalVariablePointer", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_PROGRAM_GET_GLOBAL_VARIABLE_POINTER, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_GET_GLOBAL_VARIABLE_POINTER, ¶ms); + logger.info(" <--- urProgramGetGlobalVariablePointer({}) -> {};\n", + args_str.str(), result); + } 
return result; } @@ -2722,7 +3010,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetInfo( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_PROGRAM_GET_INFO, "urProgramGetInfo", ¶ms); - getContext()->logger.info("---> urProgramGetInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramGetInfo\n"); ur_result_t result = pfnGetInfo(hProgram, propName, propSize, pPropValue, pPropSizeRet); @@ -2730,10 +3019,13 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetInfo( getContext()->notify_end(UR_FUNCTION_PROGRAM_GET_INFO, "urProgramGetInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_GET_INFO, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_GET_INFO, + ¶ms); + logger.info(" <--- urProgramGetInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -2767,7 +3059,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetBuildInfo( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_PROGRAM_GET_BUILD_INFO, "urProgramGetBuildInfo", ¶ms); - getContext()->logger.info("---> urProgramGetBuildInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramGetBuildInfo\n"); ur_result_t result = pfnGetBuildInfo(hProgram, hDevice, propName, propSize, pPropValue, pPropSizeRet); @@ -2776,10 +3069,13 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetBuildInfo( "urProgramGetBuildInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_PROGRAM_GET_BUILD_INFO, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_GET_BUILD_INFO, ¶ms); + logger.info(" 
<--- urProgramGetBuildInfo({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -2806,7 +3102,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramSetSpecializationConstants( UR_FUNCTION_PROGRAM_SET_SPECIALIZATION_CONSTANTS, "urProgramSetSpecializationConstants", ¶ms); - getContext()->logger.info("---> urProgramSetSpecializationConstants"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramSetSpecializationConstants\n"); ur_result_t result = pfnSetSpecializationConstants(hProgram, count, pSpecConstants); @@ -2815,10 +3112,14 @@ __urdlllocal ur_result_t UR_APICALL urProgramSetSpecializationConstants( "urProgramSetSpecializationConstants", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_PROGRAM_SET_SPECIALIZATION_CONSTANTS, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_SET_SPECIALIZATION_CONSTANTS, + ¶ms); + logger.info(" <--- urProgramSetSpecializationConstants({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -2843,7 +3144,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetNativeHandle( getContext()->notify_begin(UR_FUNCTION_PROGRAM_GET_NATIVE_HANDLE, "urProgramGetNativeHandle", ¶ms); - getContext()->logger.info("---> urProgramGetNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramGetNativeHandle\n"); ur_result_t result = pfnGetNativeHandle(hProgram, phNativeProgram); @@ -2851,10 +3153,13 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetNativeHandle( "urProgramGetNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_PROGRAM_GET_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + 
std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_GET_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urProgramGetNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -2883,7 +3188,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramCreateWithNativeHandle( UR_FUNCTION_PROGRAM_CREATE_WITH_NATIVE_HANDLE, "urProgramCreateWithNativeHandle", ¶ms); - getContext()->logger.info("---> urProgramCreateWithNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramCreateWithNativeHandle\n"); ur_result_t result = pfnCreateWithNativeHandle(hNativeProgram, hContext, pProperties, phProgram); @@ -2892,10 +3198,13 @@ __urdlllocal ur_result_t UR_APICALL urProgramCreateWithNativeHandle( "urProgramCreateWithNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_PROGRAM_CREATE_WITH_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_CREATE_WITH_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urProgramCreateWithNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -2918,17 +3227,21 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreate( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_KERNEL_CREATE, "urKernelCreate", ¶ms); - getContext()->logger.info("---> urKernelCreate"); + auto &logger = getContext()->logger; + logger.info(" ---> urKernelCreate\n"); ur_result_t result = pfnCreate(hProgram, pKernelName, phKernel); getContext()->notify_end(UR_FUNCTION_KERNEL_CREATE, "urKernelCreate", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_CREATE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= 
logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_CREATE, + ¶ms); + logger.info(" <--- urKernelCreate({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -2943,6 +3256,7 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgValue( *pProperties, ///< [in][optional] pointer to value properties. const void *pArgValue ///< [in] argument value represented as matching arg type. + ///< The data pointed to will be copied and therefore can be reused on return. ) { auto pfnSetArgValue = getContext()->urDdiTable.Kernel.pfnSetArgValue; @@ -2955,7 +3269,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgValue( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_KERNEL_SET_ARG_VALUE, "urKernelSetArgValue", ¶ms); - getContext()->logger.info("---> urKernelSetArgValue"); + auto &logger = getContext()->logger; + logger.info(" ---> urKernelSetArgValue\n"); ur_result_t result = pfnSetArgValue(hKernel, argIndex, argSize, pProperties, pArgValue); @@ -2963,10 +3278,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgValue( getContext()->notify_end(UR_FUNCTION_KERNEL_SET_ARG_VALUE, "urKernelSetArgValue", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_SET_ARG_VALUE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_SET_ARG_VALUE, ¶ms); + logger.info(" <--- urKernelSetArgValue({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -2992,7 +3310,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgLocal( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_KERNEL_SET_ARG_LOCAL, "urKernelSetArgLocal", ¶ms); - getContext()->logger.info("---> urKernelSetArgLocal"); + auto &logger = getContext()->logger; + logger.info(" ---> 
urKernelSetArgLocal\n"); ur_result_t result = pfnSetArgLocal(hKernel, argIndex, argSize, pProperties); @@ -3000,10 +3319,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgLocal( getContext()->notify_end(UR_FUNCTION_KERNEL_SET_ARG_LOCAL, "urKernelSetArgLocal", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_SET_ARG_LOCAL, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_SET_ARG_LOCAL, ¶ms); + logger.info(" <--- urKernelSetArgLocal({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3036,7 +3358,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetInfo( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_KERNEL_GET_INFO, "urKernelGetInfo", ¶ms); - getContext()->logger.info("---> urKernelGetInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urKernelGetInfo\n"); ur_result_t result = pfnGetInfo(hKernel, propName, propSize, pPropValue, pPropSizeRet); @@ -3044,10 +3367,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetInfo( getContext()->notify_end(UR_FUNCTION_KERNEL_GET_INFO, "urKernelGetInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_GET_INFO, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_GET_INFO, + ¶ms); + logger.info(" <--- urKernelGetInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3078,7 +3404,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetGroupInfo( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_KERNEL_GET_GROUP_INFO, "urKernelGetGroupInfo", ¶ms); - getContext()->logger.info("---> 
urKernelGetGroupInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urKernelGetGroupInfo\n"); ur_result_t result = pfnGetGroupInfo(hKernel, hDevice, propName, propSize, pPropValue, pPropSizeRet); @@ -3087,10 +3414,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetGroupInfo( "urKernelGetGroupInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_GET_GROUP_INFO, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_GET_GROUP_INFO, ¶ms); + logger.info(" <--- urKernelGetGroupInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3123,7 +3453,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSubGroupInfo( getContext()->notify_begin(UR_FUNCTION_KERNEL_GET_SUB_GROUP_INFO, "urKernelGetSubGroupInfo", ¶ms); - getContext()->logger.info("---> urKernelGetSubGroupInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urKernelGetSubGroupInfo\n"); ur_result_t result = pfnGetSubGroupInfo(hKernel, hDevice, propName, propSize, pPropValue, pPropSizeRet); @@ -3132,10 +3463,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSubGroupInfo( "urKernelGetSubGroupInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_KERNEL_GET_SUB_GROUP_INFO, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_GET_SUB_GROUP_INFO, ¶ms); + logger.info(" <--- urKernelGetSubGroupInfo({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -3155,17 +3489,21 @@ __urdlllocal ur_result_t UR_APICALL urKernelRetain( uint64_t instance = 
getContext()->notify_begin(UR_FUNCTION_KERNEL_RETAIN, "urKernelRetain", ¶ms); - getContext()->logger.info("---> urKernelRetain"); + auto &logger = getContext()->logger; + logger.info(" ---> urKernelRetain\n"); ur_result_t result = pfnRetain(hKernel); getContext()->notify_end(UR_FUNCTION_KERNEL_RETAIN, "urKernelRetain", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_RETAIN, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_RETAIN, + ¶ms); + logger.info(" <--- urKernelRetain({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3186,17 +3524,21 @@ __urdlllocal ur_result_t UR_APICALL urKernelRelease( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_KERNEL_RELEASE, "urKernelRelease", ¶ms); - getContext()->logger.info("---> urKernelRelease"); + auto &logger = getContext()->logger; + logger.info(" ---> urKernelRelease\n"); ur_result_t result = pfnRelease(hKernel); getContext()->notify_end(UR_FUNCTION_KERNEL_RELEASE, "urKernelRelease", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_RELEASE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_RELEASE, + ¶ms); + logger.info(" <--- urKernelRelease({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3223,7 +3565,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgPointer( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_KERNEL_SET_ARG_POINTER, "urKernelSetArgPointer", ¶ms); - getContext()->logger.info("---> urKernelSetArgPointer"); + auto &logger = getContext()->logger; + logger.info(" ---> 
urKernelSetArgPointer\n"); ur_result_t result = pfnSetArgPointer(hKernel, argIndex, pProperties, pArgValue); @@ -3232,10 +3575,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgPointer( "urKernelSetArgPointer", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_KERNEL_SET_ARG_POINTER, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_SET_ARG_POINTER, ¶ms); + logger.info(" <--- urKernelSetArgPointer({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -3263,7 +3609,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetExecInfo( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_KERNEL_SET_EXEC_INFO, "urKernelSetExecInfo", ¶ms); - getContext()->logger.info("---> urKernelSetExecInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urKernelSetExecInfo\n"); ur_result_t result = pfnSetExecInfo(hKernel, propName, propSize, pProperties, pPropValue); @@ -3271,10 +3618,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetExecInfo( getContext()->notify_end(UR_FUNCTION_KERNEL_SET_EXEC_INFO, "urKernelSetExecInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_SET_EXEC_INFO, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_SET_EXEC_INFO, ¶ms); + logger.info(" <--- urKernelSetExecInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3299,7 +3649,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgSampler( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_KERNEL_SET_ARG_SAMPLER, "urKernelSetArgSampler", ¶ms); - 
getContext()->logger.info("---> urKernelSetArgSampler"); + auto &logger = getContext()->logger; + logger.info(" ---> urKernelSetArgSampler\n"); ur_result_t result = pfnSetArgSampler(hKernel, argIndex, pProperties, hArgValue); @@ -3308,10 +3659,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgSampler( "urKernelSetArgSampler", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_KERNEL_SET_ARG_SAMPLER, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_SET_ARG_SAMPLER, ¶ms); + logger.info(" <--- urKernelSetArgSampler({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -3336,7 +3690,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgMemObj( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_KERNEL_SET_ARG_MEM_OBJ, "urKernelSetArgMemObj", ¶ms); - getContext()->logger.info("---> urKernelSetArgMemObj"); + auto &logger = getContext()->logger; + logger.info(" ---> urKernelSetArgMemObj\n"); ur_result_t result = pfnSetArgMemObj(hKernel, argIndex, pProperties, hArgValue); @@ -3345,10 +3700,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgMemObj( "urKernelSetArgMemObj", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_KERNEL_SET_ARG_MEM_OBJ, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_SET_ARG_MEM_OBJ, ¶ms); + logger.info(" <--- urKernelSetArgMemObj({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3374,7 +3732,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetSpecializationConstants( UR_FUNCTION_KERNEL_SET_SPECIALIZATION_CONSTANTS, 
"urKernelSetSpecializationConstants", ¶ms); - getContext()->logger.info("---> urKernelSetSpecializationConstants"); + auto &logger = getContext()->logger; + logger.info(" ---> urKernelSetSpecializationConstants\n"); ur_result_t result = pfnSetSpecializationConstants(hKernel, count, pSpecConstants); @@ -3383,10 +3742,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetSpecializationConstants( "urKernelSetSpecializationConstants", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_KERNEL_SET_SPECIALIZATION_CONSTANTS, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_SET_SPECIALIZATION_CONSTANTS, ¶ms); + logger.info(" <--- urKernelSetSpecializationConstants({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -3410,7 +3772,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetNativeHandle( getContext()->notify_begin(UR_FUNCTION_KERNEL_GET_NATIVE_HANDLE, "urKernelGetNativeHandle", ¶ms); - getContext()->logger.info("---> urKernelGetNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urKernelGetNativeHandle\n"); ur_result_t result = pfnGetNativeHandle(hKernel, phNativeKernel); @@ -3418,10 +3781,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetNativeHandle( "urKernelGetNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_KERNEL_GET_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_GET_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urKernelGetNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -3452,7 +3818,8 @@ 
__urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle( getContext()->notify_begin(UR_FUNCTION_KERNEL_CREATE_WITH_NATIVE_HANDLE, "urKernelCreateWithNativeHandle", ¶ms); - getContext()->logger.info("---> urKernelCreateWithNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urKernelCreateWithNativeHandle\n"); ur_result_t result = pfnCreateWithNativeHandle( hNativeKernel, hContext, hProgram, pProperties, phKernel); @@ -3461,10 +3828,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle( "urKernelCreateWithNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_KERNEL_CREATE_WITH_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_CREATE_WITH_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urKernelCreateWithNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -3502,7 +3872,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE, "urKernelGetSuggestedLocalWorkSize", ¶ms); - getContext()->logger.info("---> urKernelGetSuggestedLocalWorkSize"); + auto &logger = getContext()->logger; + logger.info(" ---> urKernelGetSuggestedLocalWorkSize\n"); ur_result_t result = pfnGetSuggestedLocalWorkSize( hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, @@ -3512,10 +3883,14 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( "urKernelGetSuggestedLocalWorkSize", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + 
std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE, + ¶ms); + logger.info(" <--- urKernelGetSuggestedLocalWorkSize({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -3544,7 +3919,8 @@ __urdlllocal ur_result_t UR_APICALL urQueueGetInfo( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_QUEUE_GET_INFO, "urQueueGetInfo", ¶ms); - getContext()->logger.info("---> urQueueGetInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urQueueGetInfo\n"); ur_result_t result = pfnGetInfo(hQueue, propName, propSize, pPropValue, pPropSizeRet); @@ -3552,10 +3928,13 @@ __urdlllocal ur_result_t UR_APICALL urQueueGetInfo( getContext()->notify_end(UR_FUNCTION_QUEUE_GET_INFO, "urQueueGetInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_GET_INFO, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_GET_INFO, + ¶ms); + logger.info(" <--- urQueueGetInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3581,17 +3960,21 @@ __urdlllocal ur_result_t UR_APICALL urQueueCreate( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_QUEUE_CREATE, "urQueueCreate", ¶ms); - getContext()->logger.info("---> urQueueCreate"); + auto &logger = getContext()->logger; + logger.info(" ---> urQueueCreate\n"); ur_result_t result = pfnCreate(hContext, hDevice, pProperties, phQueue); getContext()->notify_end(UR_FUNCTION_QUEUE_CREATE, "urQueueCreate", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_CREATE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + 
ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_CREATE, + ¶ms); + logger.info(" <--- urQueueCreate({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3612,17 +3995,21 @@ __urdlllocal ur_result_t UR_APICALL urQueueRetain( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_QUEUE_RETAIN, "urQueueRetain", ¶ms); - getContext()->logger.info("---> urQueueRetain"); + auto &logger = getContext()->logger; + logger.info(" ---> urQueueRetain\n"); ur_result_t result = pfnRetain(hQueue); getContext()->notify_end(UR_FUNCTION_QUEUE_RETAIN, "urQueueRetain", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_RETAIN, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_RETAIN, + ¶ms); + logger.info(" <--- urQueueRetain({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3643,17 +4030,21 @@ __urdlllocal ur_result_t UR_APICALL urQueueRelease( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_QUEUE_RELEASE, "urQueueRelease", ¶ms); - getContext()->logger.info("---> urQueueRelease"); + auto &logger = getContext()->logger; + logger.info(" ---> urQueueRelease\n"); ur_result_t result = pfnRelease(hQueue); getContext()->notify_end(UR_FUNCTION_QUEUE_RELEASE, "urQueueRelease", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_RELEASE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_RELEASE, + ¶ms); + logger.info(" <--- urQueueRelease({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3678,7 +4069,8 @@ __urdlllocal ur_result_t UR_APICALL 
urQueueGetNativeHandle( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_QUEUE_GET_NATIVE_HANDLE, "urQueueGetNativeHandle", ¶ms); - getContext()->logger.info("---> urQueueGetNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urQueueGetNativeHandle\n"); ur_result_t result = pfnGetNativeHandle(hQueue, pDesc, phNativeQueue); @@ -3686,10 +4078,13 @@ __urdlllocal ur_result_t UR_APICALL urQueueGetNativeHandle( "urQueueGetNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_QUEUE_GET_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_QUEUE_GET_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urQueueGetNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -3719,7 +4114,8 @@ __urdlllocal ur_result_t UR_APICALL urQueueCreateWithNativeHandle( getContext()->notify_begin(UR_FUNCTION_QUEUE_CREATE_WITH_NATIVE_HANDLE, "urQueueCreateWithNativeHandle", ¶ms); - getContext()->logger.info("---> urQueueCreateWithNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urQueueCreateWithNativeHandle\n"); ur_result_t result = pfnCreateWithNativeHandle( hNativeQueue, hContext, hDevice, pProperties, phQueue); @@ -3728,10 +4124,13 @@ __urdlllocal ur_result_t UR_APICALL urQueueCreateWithNativeHandle( "urQueueCreateWithNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_QUEUE_CREATE_WITH_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_QUEUE_CREATE_WITH_NATIVE_HANDLE, ¶ms); + logger.info(" <--- 
urQueueCreateWithNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -3751,17 +4150,21 @@ __urdlllocal ur_result_t UR_APICALL urQueueFinish( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_QUEUE_FINISH, "urQueueFinish", ¶ms); - getContext()->logger.info("---> urQueueFinish"); + auto &logger = getContext()->logger; + logger.info(" ---> urQueueFinish\n"); ur_result_t result = pfnFinish(hQueue); getContext()->notify_end(UR_FUNCTION_QUEUE_FINISH, "urQueueFinish", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_FINISH, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_FINISH, + ¶ms); + logger.info(" <--- urQueueFinish({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3781,16 +4184,21 @@ __urdlllocal ur_result_t UR_APICALL urQueueFlush( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_QUEUE_FLUSH, "urQueueFlush", ¶ms); - getContext()->logger.info("---> urQueueFlush"); + auto &logger = getContext()->logger; + logger.info(" ---> urQueueFlush\n"); ur_result_t result = pfnFlush(hQueue); getContext()->notify_end(UR_FUNCTION_QUEUE_FLUSH, "urQueueFlush", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_FLUSH, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_FLUSH, + ¶ms); + logger.info(" <--- urQueueFlush({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3817,7 +4225,8 @@ __urdlllocal ur_result_t UR_APICALL urEventGetInfo( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_EVENT_GET_INFO, "urEventGetInfo", ¶ms); - 
getContext()->logger.info("---> urEventGetInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urEventGetInfo\n"); ur_result_t result = pfnGetInfo(hEvent, propName, propSize, pPropValue, pPropSizeRet); @@ -3825,10 +4234,13 @@ __urdlllocal ur_result_t UR_APICALL urEventGetInfo( getContext()->notify_end(UR_FUNCTION_EVENT_GET_INFO, "urEventGetInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_EVENT_GET_INFO, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_EVENT_GET_INFO, + ¶ms); + logger.info(" <--- urEventGetInfo({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3860,7 +4272,8 @@ __urdlllocal ur_result_t UR_APICALL urEventGetProfilingInfo( getContext()->notify_begin(UR_FUNCTION_EVENT_GET_PROFILING_INFO, "urEventGetProfilingInfo", ¶ms); - getContext()->logger.info("---> urEventGetProfilingInfo"); + auto &logger = getContext()->logger; + logger.info(" ---> urEventGetProfilingInfo\n"); ur_result_t result = pfnGetProfilingInfo(hEvent, propName, propSize, pPropValue, pPropSizeRet); @@ -3869,10 +4282,13 @@ __urdlllocal ur_result_t UR_APICALL urEventGetProfilingInfo( "urEventGetProfilingInfo", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_EVENT_GET_PROFILING_INFO, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_EVENT_GET_PROFILING_INFO, ¶ms); + logger.info(" <--- urEventGetProfilingInfo({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -3895,16 +4311,20 @@ __urdlllocal ur_result_t UR_APICALL urEventWait( uint64_t instance = 
getContext()->notify_begin(UR_FUNCTION_EVENT_WAIT, "urEventWait", ¶ms); - getContext()->logger.info("---> urEventWait"); + auto &logger = getContext()->logger; + logger.info(" ---> urEventWait\n"); ur_result_t result = pfnWait(numEvents, phEventWaitList); getContext()->notify_end(UR_FUNCTION_EVENT_WAIT, "urEventWait", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_EVENT_WAIT, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_EVENT_WAIT, + ¶ms); + logger.info(" <--- urEventWait({}) -> {};\n", args_str.str(), result); + } return result; } @@ -3924,17 +4344,21 @@ __urdlllocal ur_result_t UR_APICALL urEventRetain( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_EVENT_RETAIN, "urEventRetain", ¶ms); - getContext()->logger.info("---> urEventRetain"); + auto &logger = getContext()->logger; + logger.info(" ---> urEventRetain\n"); ur_result_t result = pfnRetain(hEvent); getContext()->notify_end(UR_FUNCTION_EVENT_RETAIN, "urEventRetain", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_EVENT_RETAIN, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_EVENT_RETAIN, + ¶ms); + logger.info(" <--- urEventRetain({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3954,17 +4378,21 @@ __urdlllocal ur_result_t UR_APICALL urEventRelease( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_EVENT_RELEASE, "urEventRelease", ¶ms); - getContext()->logger.info("---> urEventRelease"); + auto &logger = getContext()->logger; + logger.info(" ---> urEventRelease\n"); ur_result_t result = pfnRelease(hEvent); 
getContext()->notify_end(UR_FUNCTION_EVENT_RELEASE, "urEventRelease", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_EVENT_RELEASE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_EVENT_RELEASE, + ¶ms); + logger.info(" <--- urEventRelease({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -3986,7 +4414,8 @@ __urdlllocal ur_result_t UR_APICALL urEventGetNativeHandle( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_EVENT_GET_NATIVE_HANDLE, "urEventGetNativeHandle", ¶ms); - getContext()->logger.info("---> urEventGetNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urEventGetNativeHandle\n"); ur_result_t result = pfnGetNativeHandle(hEvent, phNativeEvent); @@ -3994,10 +4423,13 @@ __urdlllocal ur_result_t UR_APICALL urEventGetNativeHandle( "urEventGetNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_EVENT_GET_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_EVENT_GET_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urEventGetNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -4026,7 +4458,8 @@ __urdlllocal ur_result_t UR_APICALL urEventCreateWithNativeHandle( getContext()->notify_begin(UR_FUNCTION_EVENT_CREATE_WITH_NATIVE_HANDLE, "urEventCreateWithNativeHandle", ¶ms); - getContext()->logger.info("---> urEventCreateWithNativeHandle"); + auto &logger = getContext()->logger; + logger.info(" ---> urEventCreateWithNativeHandle\n"); ur_result_t result = pfnCreateWithNativeHandle(hNativeEvent, hContext, pProperties, 
phEvent); @@ -4035,10 +4468,13 @@ __urdlllocal ur_result_t UR_APICALL urEventCreateWithNativeHandle( "urEventCreateWithNativeHandle", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_EVENT_CREATE_WITH_NATIVE_HANDLE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_EVENT_CREATE_WITH_NATIVE_HANDLE, ¶ms); + logger.info(" <--- urEventCreateWithNativeHandle({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -4063,7 +4499,8 @@ __urdlllocal ur_result_t UR_APICALL urEventSetCallback( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_EVENT_SET_CALLBACK, "urEventSetCallback", ¶ms); - getContext()->logger.info("---> urEventSetCallback"); + auto &logger = getContext()->logger; + logger.info(" ---> urEventSetCallback\n"); ur_result_t result = pfnSetCallback(hEvent, execStatus, pfnNotify, pUserData); @@ -4071,10 +4508,13 @@ __urdlllocal ur_result_t UR_APICALL urEventSetCallback( getContext()->notify_end(UR_FUNCTION_EVENT_SET_CALLBACK, "urEventSetCallback", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_EVENT_SET_CALLBACK, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_EVENT_SET_CALLBACK, ¶ms); + logger.info(" <--- urEventSetCallback({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -4098,17 +4538,16 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that ///< specify the number of local work-items forming a work-group that will ///< execute the kernel function. 
- ///< If nullptr, the runtime implementation will choose the work-group - ///< size. + ///< If nullptr, the runtime implementation will choose the work-group size. uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { auto pfnKernelLaunch = getContext()->urDdiTable.Enqueue.pfnKernelLaunch; @@ -4128,7 +4567,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH, "urEnqueueKernelLaunch", ¶ms); - getContext()->logger.info("---> urEnqueueKernelLaunch"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueKernelLaunch\n"); ur_result_t result = pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, @@ -4138,10 +4578,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( "urEnqueueKernelLaunch", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH, ¶ms); + logger.info(" <--- urEnqueueKernelLaunch({}) -> {};\n", + args_str.str(), result); + } return result; } @@ 
-4159,7 +4602,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWait( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnEventsWait = getContext()->urDdiTable.Enqueue.pfnEventsWait; @@ -4172,7 +4616,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWait( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_EVENTS_WAIT, "urEnqueueEventsWait", ¶ms); - getContext()->logger.info("---> urEnqueueEventsWait"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueEventsWait\n"); ur_result_t result = pfnEventsWait(hQueue, numEventsInWaitList, phEventWaitList, phEvent); @@ -4180,10 +4625,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWait( getContext()->notify_end(UR_FUNCTION_ENQUEUE_EVENTS_WAIT, "urEnqueueEventsWait", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_EVENTS_WAIT, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_EVENTS_WAIT, ¶ms); + logger.info(" <--- urEnqueueEventsWait({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -4201,7 +4649,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnEventsWaitWithBarrier = getContext()->urDdiTable.Enqueue.pfnEventsWaitWithBarrier; @@ -4216,7 +4665,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( getContext()->notify_begin(UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER, "urEnqueueEventsWaitWithBarrier", ¶ms); - getContext()->logger.info("---> urEnqueueEventsWaitWithBarrier"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueEventsWaitWithBarrier\n"); ur_result_t result = pfnEventsWaitWithBarrier(hQueue, numEventsInWaitList, phEventWaitList, phEvent); @@ -4225,10 +4675,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( "urEnqueueEventsWaitWithBarrier", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER, ¶ms); + logger.info(" <--- urEnqueueEventsWaitWithBarrier({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -4251,7 +4704,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferRead( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnMemBufferRead = getContext()->urDdiTable.Enqueue.pfnMemBufferRead; @@ -4266,7 +4720,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferRead( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_MEM_BUFFER_READ, "urEnqueueMemBufferRead", ¶ms); - getContext()->logger.info("---> urEnqueueMemBufferRead"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueMemBufferRead\n"); ur_result_t result = pfnMemBufferRead(hQueue, hBuffer, blockingRead, offset, size, pDst, @@ -4276,10 +4731,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferRead( "urEnqueueMemBufferRead", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_READ, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_READ, ¶ms); + logger.info(" <--- urEnqueueMemBufferRead({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -4304,7 +4762,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWrite( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnMemBufferWrite = getContext()->urDdiTable.Enqueue.pfnMemBufferWrite; @@ -4320,7 +4779,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWrite( getContext()->notify_begin(UR_FUNCTION_ENQUEUE_MEM_BUFFER_WRITE, "urEnqueueMemBufferWrite", ¶ms); - getContext()->logger.info("---> urEnqueueMemBufferWrite"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueMemBufferWrite\n"); ur_result_t result = pfnMemBufferWrite(hQueue, hBuffer, blockingWrite, offset, size, pSrc, @@ -4330,10 +4790,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWrite( "urEnqueueMemBufferWrite", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_WRITE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_WRITE, ¶ms); + logger.info(" <--- urEnqueueMemBufferWrite({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -4368,7 +4831,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferReadRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnMemBufferReadRect = getContext()->urDdiTable.Enqueue.pfnMemBufferReadRect; @@ -4395,7 +4859,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferReadRect( getContext()->notify_begin(UR_FUNCTION_ENQUEUE_MEM_BUFFER_READ_RECT, "urEnqueueMemBufferReadRect", ¶ms); - getContext()->logger.info("---> urEnqueueMemBufferReadRect"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueMemBufferReadRect\n"); ur_result_t result = pfnMemBufferReadRect( hQueue, hBuffer, blockingRead, bufferOrigin, hostOrigin, region, @@ -4406,10 +4871,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferReadRect( "urEnqueueMemBufferReadRect", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_READ_RECT, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_READ_RECT, ¶ms); + logger.info(" <--- urEnqueueMemBufferReadRect({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -4447,7 +4915,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnMemBufferWriteRect = getContext()->urDdiTable.Enqueue.pfnMemBufferWriteRect; @@ -4474,7 +4943,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( getContext()->notify_begin(UR_FUNCTION_ENQUEUE_MEM_BUFFER_WRITE_RECT, "urEnqueueMemBufferWriteRect", ¶ms); - getContext()->logger.info("---> urEnqueueMemBufferWriteRect"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueMemBufferWriteRect\n"); ur_result_t result = pfnMemBufferWriteRect( hQueue, hBuffer, blockingWrite, bufferOrigin, hostOrigin, region, @@ -4485,10 +4955,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( "urEnqueueMemBufferWriteRect", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_WRITE_RECT, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_WRITE_RECT, ¶ms); + logger.info(" <--- urEnqueueMemBufferWriteRect({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -4512,7 +4985,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnMemBufferCopy = getContext()->urDdiTable.Enqueue.pfnMemBufferCopy; @@ -4526,7 +5000,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopy( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_MEM_BUFFER_COPY, "urEnqueueMemBufferCopy", ¶ms); - getContext()->logger.info("---> urEnqueueMemBufferCopy"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueMemBufferCopy\n"); ur_result_t result = pfnMemBufferCopy(hQueue, hBufferSrc, hBufferDst, srcOffset, dstOffset, @@ -4536,10 +5011,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopy( "urEnqueueMemBufferCopy", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_COPY, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_COPY, ¶ms); + logger.info(" <--- urEnqueueMemBufferCopy({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -4572,7 +5050,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnMemBufferCopyRect = getContext()->urDdiTable.Enqueue.pfnMemBufferCopyRect; @@ -4590,7 +5069,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( getContext()->notify_begin(UR_FUNCTION_ENQUEUE_MEM_BUFFER_COPY_RECT, "urEnqueueMemBufferCopyRect", ¶ms); - getContext()->logger.info("---> urEnqueueMemBufferCopyRect"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueMemBufferCopyRect\n"); ur_result_t result = pfnMemBufferCopyRect( hQueue, hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, @@ -4601,10 +5081,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( "urEnqueueMemBufferCopyRect", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_COPY_RECT, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_COPY_RECT, ¶ms); + logger.info(" <--- urEnqueueMemBufferCopyRect({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -4627,7 +5110,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferFill( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnMemBufferFill = getContext()->urDdiTable.Enqueue.pfnMemBufferFill; @@ -4647,7 +5131,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferFill( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_MEM_BUFFER_FILL, "urEnqueueMemBufferFill", ¶ms); - getContext()->logger.info("---> urEnqueueMemBufferFill"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueMemBufferFill\n"); ur_result_t result = pfnMemBufferFill(hQueue, hBuffer, pPattern, patternSize, offset, size, @@ -4657,10 +5142,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferFill( "urEnqueueMemBufferFill", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_FILL, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_FILL, ¶ms); + logger.info(" <--- urEnqueueMemBufferFill({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -4688,7 +5176,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageRead( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnMemImageRead = getContext()->urDdiTable.Enqueue.pfnMemImageRead; @@ -4704,7 +5193,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageRead( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_MEM_IMAGE_READ, "urEnqueueMemImageRead", ¶ms); - getContext()->logger.info("---> urEnqueueMemImageRead"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueMemImageRead\n"); ur_result_t result = pfnMemImageRead( hQueue, hImage, blockingRead, origin, region, rowPitch, slicePitch, @@ -4714,10 +5204,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageRead( "urEnqueueMemImageRead", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_MEM_IMAGE_READ, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_IMAGE_READ, ¶ms); + logger.info(" <--- urEnqueueMemImageRead({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -4746,7 +5239,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageWrite( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnMemImageWrite = getContext()->urDdiTable.Enqueue.pfnMemImageWrite; @@ -4762,7 +5256,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageWrite( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_MEM_IMAGE_WRITE, "urEnqueueMemImageWrite", ¶ms); - getContext()->logger.info("---> urEnqueueMemImageWrite"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueMemImageWrite\n"); ur_result_t result = pfnMemImageWrite( hQueue, hImage, blockingWrite, origin, region, rowPitch, slicePitch, @@ -4772,10 +5267,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageWrite( "urEnqueueMemImageWrite", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_MEM_IMAGE_WRITE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_IMAGE_WRITE, ¶ms); + logger.info(" <--- urEnqueueMemImageWrite({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -4805,7 +5303,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageCopy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnMemImageCopy = getContext()->urDdiTable.Enqueue.pfnMemImageCopy; @@ -4819,7 +5318,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageCopy( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_MEM_IMAGE_COPY, "urEnqueueMemImageCopy", ¶ms); - getContext()->logger.info("---> urEnqueueMemImageCopy"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueMemImageCopy\n"); ur_result_t result = pfnMemImageCopy(hQueue, hImageSrc, hImageDst, srcOrigin, dstOrigin, @@ -4829,10 +5329,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageCopy( "urEnqueueMemImageCopy", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_MEM_IMAGE_COPY, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_IMAGE_COPY, ¶ms); + logger.info(" <--- urEnqueueMemImageCopy({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -4855,7 +5358,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferMap( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent, ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. void **ppRetMap ///< [out] return mapped pointer. TODO: move it before ///< numEventsInWaitList? 
) { @@ -4872,7 +5376,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferMap( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_MEM_BUFFER_MAP, "urEnqueueMemBufferMap", ¶ms); - getContext()->logger.info("---> urEnqueueMemBufferMap"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueMemBufferMap\n"); ur_result_t result = pfnMemBufferMap(hQueue, hBuffer, blockingMap, mapFlags, offset, size, numEventsInWaitList, @@ -4882,10 +5387,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferMap( "urEnqueueMemBufferMap", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_MAP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_MAP, ¶ms); + logger.info(" <--- urEnqueueMemBufferMap({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -4905,7 +5413,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemUnmap( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnMemUnmap = getContext()->urDdiTable.Enqueue.pfnMemUnmap; @@ -4919,7 +5428,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemUnmap( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_MEM_UNMAP, "urEnqueueMemUnmap", ¶ms); - getContext()->logger.info("---> urEnqueueMemUnmap"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueMemUnmap\n"); ur_result_t result = pfnMemUnmap(hQueue, hMem, pMappedPtr, numEventsInWaitList, @@ -4928,10 +5438,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemUnmap( getContext()->notify_end(UR_FUNCTION_ENQUEUE_MEM_UNMAP, "urEnqueueMemUnmap", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_MEM_UNMAP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_MEM_UNMAP, + ¶ms); + logger.info(" <--- urEnqueueMemUnmap({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -4956,7 +5469,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnUSMFill = getContext()->urDdiTable.Enqueue.pfnUSMFill; @@ -4971,7 +5485,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_ENQUEUE_USM_FILL, "urEnqueueUSMFill", ¶ms); - getContext()->logger.info("---> urEnqueueUSMFill"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueUSMFill\n"); ur_result_t result = pfnUSMFill(hQueue, pMem, patternSize, pPattern, size, @@ -4980,10 +5495,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill( getContext()->notify_end(UR_FUNCTION_ENQUEUE_USM_FILL, "urEnqueueUSMFill", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_USM_FILL, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_USM_FILL, + ¶ms); + logger.info(" <--- urEnqueueUSMFill({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -5006,7 +5524,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnUSMMemcpy = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy; @@ -5020,7 +5539,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_USM_MEMCPY, "urEnqueueUSMMemcpy", ¶ms); - getContext()->logger.info("---> urEnqueueUSMMemcpy"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueUSMMemcpy\n"); ur_result_t result = pfnUSMMemcpy(hQueue, blocking, pDst, pSrc, size, numEventsInWaitList, @@ -5029,10 +5549,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy( getContext()->notify_end(UR_FUNCTION_ENQUEUE_USM_MEMCPY, "urEnqueueUSMMemcpy", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_USM_MEMCPY, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_USM_MEMCPY, ¶ms); + logger.info(" <--- urEnqueueUSMMemcpy({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -5053,7 +5576,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMPrefetch( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnUSMPrefetch = getContext()->urDdiTable.Enqueue.pfnUSMPrefetch; @@ -5067,7 +5591,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMPrefetch( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_USM_PREFETCH, "urEnqueueUSMPrefetch", ¶ms); - getContext()->logger.info("---> urEnqueueUSMPrefetch"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueUSMPrefetch\n"); ur_result_t result = pfnUSMPrefetch(hQueue, pMem, size, flags, numEventsInWaitList, @@ -5077,10 +5602,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMPrefetch( "urEnqueueUSMPrefetch", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_USM_PREFETCH, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_USM_PREFETCH, ¶ms); + logger.info(" <--- urEnqueueUSMPrefetch({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -5108,17 +5636,21 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMAdvise( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_USM_ADVISE, "urEnqueueUSMAdvise", ¶ms); - getContext()->logger.info("---> urEnqueueUSMAdvise"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueUSMAdvise\n"); ur_result_t result = pfnUSMAdvise(hQueue, pMem, size, advice, phEvent); getContext()->notify_end(UR_FUNCTION_ENQUEUE_USM_ADVISE, "urEnqueueUSMAdvise", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_USM_ADVISE, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_USM_ADVISE, ¶ms); + logger.info(" <--- urEnqueueUSMAdvise({}) -> {};\n", 
args_str.str(), + result); + } return result; } @@ -5144,11 +5676,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill2D( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { auto pfnUSMFill2D = getContext()->urDdiTable.Enqueue.pfnUSMFill2D; @@ -5163,7 +5695,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill2D( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_USM_FILL_2D, "urEnqueueUSMFill2D", ¶ms); - getContext()->logger.info("---> urEnqueueUSMFill2D"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueUSMFill2D\n"); ur_result_t result = pfnUSMFill2D(hQueue, pMem, pitch, patternSize, pPattern, width, height, @@ -5172,10 +5705,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill2D( getContext()->notify_end(UR_FUNCTION_ENQUEUE_USM_FILL_2D, "urEnqueueUSMFill2D", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_USM_FILL_2D, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_USM_FILL_2D, ¶ms); + logger.info(" <--- urEnqueueUSMFill2D({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -5200,11 +5736,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( const 
ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { auto pfnUSMMemcpy2D = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy2D; @@ -5220,7 +5756,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_USM_MEMCPY_2D, "urEnqueueUSMMemcpy2D", ¶ms); - getContext()->logger.info("---> urEnqueueUSMMemcpy2D"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueUSMMemcpy2D\n"); ur_result_t result = pfnUSMMemcpy2D(hQueue, blocking, pDst, dstPitch, pSrc, srcPitch, width, @@ -5230,10 +5767,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( "urEnqueueUSMMemcpy2D", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_USM_MEMCPY_2D, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_USM_MEMCPY_2D, ¶ms); + logger.info(" <--- urEnqueueUSMMemcpy2D({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -5255,11 +5795,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the 
kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { auto pfnDeviceGlobalVariableWrite = getContext()->urDdiTable.Enqueue.pfnDeviceGlobalVariableWrite; @@ -5276,7 +5816,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( UR_FUNCTION_ENQUEUE_DEVICE_GLOBAL_VARIABLE_WRITE, "urEnqueueDeviceGlobalVariableWrite", ¶ms); - getContext()->logger.info("---> urEnqueueDeviceGlobalVariableWrite"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueDeviceGlobalVariableWrite\n"); ur_result_t result = pfnDeviceGlobalVariableWrite( hQueue, hProgram, name, blockingWrite, count, offset, pSrc, @@ -5286,10 +5827,14 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( "urEnqueueDeviceGlobalVariableWrite", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_DEVICE_GLOBAL_VARIABLE_WRITE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_DEVICE_GLOBAL_VARIABLE_WRITE, + ¶ms); + logger.info(" <--- urEnqueueDeviceGlobalVariableWrite({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -5311,11 +5856,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel 
execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { auto pfnDeviceGlobalVariableRead = getContext()->urDdiTable.Enqueue.pfnDeviceGlobalVariableRead; @@ -5332,7 +5877,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( UR_FUNCTION_ENQUEUE_DEVICE_GLOBAL_VARIABLE_READ, "urEnqueueDeviceGlobalVariableRead", ¶ms); - getContext()->logger.info("---> urEnqueueDeviceGlobalVariableRead"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueDeviceGlobalVariableRead\n"); ur_result_t result = pfnDeviceGlobalVariableRead( hQueue, hProgram, name, blockingRead, count, offset, pDst, @@ -5342,10 +5888,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( "urEnqueueDeviceGlobalVariableRead", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_DEVICE_GLOBAL_VARIABLE_READ, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_DEVICE_GLOBAL_VARIABLE_READ, ¶ms); + logger.info(" <--- urEnqueueDeviceGlobalVariableRead({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -5373,9 +5922,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueReadHostPipe( ///< events that must be complete before the host pipe read. ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. 
ur_event_handle_t * - phEvent ///< [out][optional] returns an event object that identifies this read - ///< command + phEvent ///< [out][optional] returns an event object that identifies this read command ///< and can be used to query or queue a wait for this command to complete. + ///< If phEventWaitList and phEvent are not NULL, phEvent must not refer to + ///< an element of the phEventWaitList array. ) { auto pfnReadHostPipe = getContext()->urDdiTable.Enqueue.pfnReadHostPipe; @@ -5390,7 +5940,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueReadHostPipe( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_READ_HOST_PIPE, "urEnqueueReadHostPipe", ¶ms); - getContext()->logger.info("---> urEnqueueReadHostPipe"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueReadHostPipe\n"); ur_result_t result = pfnReadHostPipe(hQueue, hProgram, pipe_symbol, blocking, pDst, size, @@ -5400,10 +5951,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueReadHostPipe( "urEnqueueReadHostPipe", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_READ_HOST_PIPE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_READ_HOST_PIPE, ¶ms); + logger.info(" <--- urEnqueueReadHostPipe({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -5434,6 +5988,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueWriteHostPipe( ur_event_handle_t * phEvent ///< [out][optional] returns an event object that identifies this write command ///< and can be used to query or queue a wait for this command to complete. + ///< If phEventWaitList and phEvent are not NULL, phEvent must not refer to + ///< an element of the phEventWaitList array. 
) { auto pfnWriteHostPipe = getContext()->urDdiTable.Enqueue.pfnWriteHostPipe; @@ -5448,7 +6004,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueWriteHostPipe( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_WRITE_HOST_PIPE, "urEnqueueWriteHostPipe", ¶ms); - getContext()->logger.info("---> urEnqueueWriteHostPipe"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueWriteHostPipe\n"); ur_result_t result = pfnWriteHostPipe(hQueue, hProgram, pipe_symbol, blocking, pSrc, size, @@ -5458,10 +6015,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueWriteHostPipe( "urEnqueueWriteHostPipe", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_WRITE_HOST_PIPE, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_WRITE_HOST_PIPE, ¶ms); + logger.info(" <--- urEnqueueWriteHostPipe({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -5496,7 +6056,8 @@ __urdlllocal ur_result_t UR_APICALL urUSMPitchedAllocExp( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_USM_PITCHED_ALLOC_EXP, "urUSMPitchedAllocExp", ¶ms); - getContext()->logger.info("---> urUSMPitchedAllocExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urUSMPitchedAllocExp\n"); ur_result_t result = pfnPitchedAllocExp(hContext, hDevice, pUSMDesc, pool, widthInBytes, @@ -5506,10 +6067,13 @@ __urdlllocal ur_result_t UR_APICALL urUSMPitchedAllocExp( "urUSMPitchedAllocExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_PITCHED_ALLOC_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, 
UR_FUNCTION_USM_PITCHED_ALLOC_EXP, ¶ms); + logger.info(" <--- urUSMPitchedAllocExp({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -5537,8 +6101,8 @@ urBindlessImagesUnsampledImageHandleDestroyExp( UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_HANDLE_DESTROY_EXP, "urBindlessImagesUnsampledImageHandleDestroyExp", ¶ms); - getContext()->logger.info( - "---> urBindlessImagesUnsampledImageHandleDestroyExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesUnsampledImageHandleDestroyExp\n"); ur_result_t result = pfnUnsampledImageHandleDestroyExp(hContext, hDevice, hImage); @@ -5548,12 +6112,17 @@ urBindlessImagesUnsampledImageHandleDestroyExp( "urBindlessImagesUnsampledImageHandleDestroyExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, - UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_HANDLE_DESTROY_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, + UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_HANDLE_DESTROY_EXP, + ¶ms); + logger.info( + " <--- urBindlessImagesUnsampledImageHandleDestroyExp({}) -> " + "{};\n", + args_str.str(), result); + } return result; } @@ -5581,8 +6150,8 @@ urBindlessImagesSampledImageHandleDestroyExp( UR_FUNCTION_BINDLESS_IMAGES_SAMPLED_IMAGE_HANDLE_DESTROY_EXP, "urBindlessImagesSampledImageHandleDestroyExp", ¶ms); - getContext()->logger.info( - "---> urBindlessImagesSampledImageHandleDestroyExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesSampledImageHandleDestroyExp\n"); ur_result_t result = pfnSampledImageHandleDestroyExp(hContext, hDevice, hImage); @@ -5592,11 +6161,16 @@ urBindlessImagesSampledImageHandleDestroyExp( "urBindlessImagesSampledImageHandleDestroyExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - 
args_str, UR_FUNCTION_BINDLESS_IMAGES_SAMPLED_IMAGE_HANDLE_DESTROY_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, + UR_FUNCTION_BINDLESS_IMAGES_SAMPLED_IMAGE_HANDLE_DESTROY_EXP, + ¶ms); + logger.info( + " <--- urBindlessImagesSampledImageHandleDestroyExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -5625,7 +6199,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( UR_FUNCTION_BINDLESS_IMAGES_IMAGE_ALLOCATE_EXP, "urBindlessImagesImageAllocateExp", ¶ms); - getContext()->logger.info("---> urBindlessImagesImageAllocateExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesImageAllocateExp\n"); ur_result_t result = pfnImageAllocateExp(hContext, hDevice, pImageFormat, pImageDesc, phImageMem); @@ -5634,10 +6209,13 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( "urBindlessImagesImageAllocateExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_BINDLESS_IMAGES_IMAGE_ALLOCATE_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_IMAGE_ALLOCATE_EXP, ¶ms); + logger.info(" <--- urBindlessImagesImageAllocateExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -5663,7 +6241,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageFreeExp( getContext()->notify_begin(UR_FUNCTION_BINDLESS_IMAGES_IMAGE_FREE_EXP, "urBindlessImagesImageFreeExp", ¶ms); - getContext()->logger.info("---> urBindlessImagesImageFreeExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesImageFreeExp\n"); ur_result_t result = pfnImageFreeExp(hContext, hDevice, 
hImageMem); @@ -5671,10 +6250,13 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageFreeExp( "urBindlessImagesImageFreeExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_BINDLESS_IMAGES_IMAGE_FREE_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_IMAGE_FREE_EXP, ¶ms); + logger.info(" <--- urBindlessImagesImageFreeExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -5705,7 +6287,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_CREATE_EXP, "urBindlessImagesUnsampledImageCreateExp", ¶ms); - getContext()->logger.info("---> urBindlessImagesUnsampledImageCreateExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesUnsampledImageCreateExp\n"); ur_result_t result = pfnUnsampledImageCreateExp( hContext, hDevice, hImageMem, pImageFormat, pImageDesc, phImage); @@ -5714,11 +6297,15 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_CREATE_EXP, "urBindlessImagesUnsampledImageCreateExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_CREATE_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_CREATE_EXP, + ¶ms); + logger.info( + " <--- urBindlessImagesUnsampledImageCreateExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -5751,7 +6338,8 @@ __urdlllocal ur_result_t UR_APICALL 
urBindlessImagesSampledImageCreateExp( UR_FUNCTION_BINDLESS_IMAGES_SAMPLED_IMAGE_CREATE_EXP, "urBindlessImagesSampledImageCreateExp", ¶ms); - getContext()->logger.info("---> urBindlessImagesSampledImageCreateExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesSampledImageCreateExp\n"); ur_result_t result = pfnSampledImageCreateExp(hContext, hDevice, hImageMem, pImageFormat, @@ -5761,11 +6349,15 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( UR_FUNCTION_BINDLESS_IMAGES_SAMPLED_IMAGE_CREATE_EXP, "urBindlessImagesSampledImageCreateExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_BINDLESS_IMAGES_SAMPLED_IMAGE_CREATE_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_SAMPLED_IMAGE_CREATE_EXP, + ¶ms); + logger.info( + " <--- urBindlessImagesSampledImageCreateExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -5796,7 +6388,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageCopyExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnImageCopyExp = getContext()->urDdiTable.BindlessImagesExp.pfnImageCopyExp; @@ -5821,7 +6414,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageCopyExp( getContext()->notify_begin(UR_FUNCTION_BINDLESS_IMAGES_IMAGE_COPY_EXP, "urBindlessImagesImageCopyExp", ¶ms); - getContext()->logger.info("---> urBindlessImagesImageCopyExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesImageCopyExp\n"); ur_result_t result = pfnImageCopyExp( hQueue, pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, @@ -5832,10 +6426,13 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageCopyExp( "urBindlessImagesImageCopyExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_BINDLESS_IMAGES_IMAGE_COPY_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_IMAGE_COPY_EXP, ¶ms); + logger.info(" <--- urBindlessImagesImageCopyExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -5863,7 +6460,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageGetInfoExp( UR_FUNCTION_BINDLESS_IMAGES_IMAGE_GET_INFO_EXP, "urBindlessImagesImageGetInfoExp", ¶ms); - getContext()->logger.info("---> urBindlessImagesImageGetInfoExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesImageGetInfoExp\n"); ur_result_t result = pfnImageGetInfoExp(hContext, hImageMem, propName, pPropValue, pPropSizeRet); @@ -5872,10 +6470,13 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageGetInfoExp( "urBindlessImagesImageGetInfoExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_BINDLESS_IMAGES_IMAGE_GET_INFO_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if 
(logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_IMAGE_GET_INFO_EXP, ¶ms); + logger.info(" <--- urBindlessImagesImageGetInfoExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -5904,7 +6505,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp( UR_FUNCTION_BINDLESS_IMAGES_MIPMAP_GET_LEVEL_EXP, "urBindlessImagesMipmapGetLevelExp", ¶ms); - getContext()->logger.info("---> urBindlessImagesMipmapGetLevelExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesMipmapGetLevelExp\n"); ur_result_t result = pfnMipmapGetLevelExp(hContext, hDevice, hImageMem, mipmapLevel, phImageMem); @@ -5913,10 +6515,14 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp( "urBindlessImagesMipmapGetLevelExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_BINDLESS_IMAGES_MIPMAP_GET_LEVEL_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_MIPMAP_GET_LEVEL_EXP, + ¶ms); + logger.info(" <--- urBindlessImagesMipmapGetLevelExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -5942,7 +6548,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMipmapFreeExp( getContext()->notify_begin(UR_FUNCTION_BINDLESS_IMAGES_MIPMAP_FREE_EXP, "urBindlessImagesMipmapFreeExp", ¶ms); - getContext()->logger.info("---> urBindlessImagesMipmapFreeExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesMipmapFreeExp\n"); ur_result_t result = pfnMipmapFreeExp(hContext, hDevice, hMem); @@ -5950,10 +6557,13 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMipmapFreeExp( "urBindlessImagesMipmapFreeExp", ¶ms, &result, instance); - 
std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_BINDLESS_IMAGES_MIPMAP_FREE_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_MIPMAP_FREE_EXP, ¶ms); + logger.info(" <--- urBindlessImagesMipmapFreeExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -5966,10 +6576,10 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( size_t size, ///< [in] size of the external memory ur_exp_external_mem_type_t memHandleType, ///< [in] type of external memory handle - ur_exp_interop_mem_desc_t - *pInteropMemDesc, ///< [in] the interop memory descriptor - ur_exp_interop_mem_handle_t - *phInteropMem ///< [out] interop memory handle to the external memory + ur_exp_external_mem_desc_t + *pExternalMemDesc, ///< [in] the external memory descriptor + ur_exp_external_mem_handle_t + *phExternalMem ///< [out] external memory handle to the external memory ) { auto pfnImportExternalMemoryExp = getContext()->urDdiTable.BindlessImagesExp.pfnImportExternalMemoryExp; @@ -5979,26 +6589,32 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( } ur_bindless_images_import_external_memory_exp_params_t params = { - &hContext, &hDevice, &size, - &memHandleType, &pInteropMemDesc, &phInteropMem}; + &hContext, &hDevice, &size, + &memHandleType, &pExternalMemDesc, &phExternalMem}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_MEMORY_EXP, "urBindlessImagesImportExternalMemoryExp", ¶ms); - getContext()->logger.info("---> urBindlessImagesImportExternalMemoryExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesImportExternalMemoryExp\n"); - ur_result_t result = pfnImportExternalMemoryExp( - hContext, hDevice, size, memHandleType, pInteropMemDesc, 
phInteropMem); + ur_result_t result = + pfnImportExternalMemoryExp(hContext, hDevice, size, memHandleType, + pExternalMemDesc, phExternalMem); getContext()->notify_end( UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_MEMORY_EXP, "urBindlessImagesImportExternalMemoryExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_MEMORY_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_MEMORY_EXP, + ¶ms); + logger.info( + " <--- urBindlessImagesImportExternalMemoryExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6011,8 +6627,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description - ur_exp_interop_mem_handle_t - hInteropMem, ///< [in] interop memory handle to the external memory + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory ur_exp_image_mem_native_handle_t * phImageMem ///< [out] image memory handle to the externally allocated memory ) { @@ -6024,62 +6640,124 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( } ur_bindless_images_map_external_array_exp_params_t params = { - &hContext, &hDevice, &pImageFormat, - &pImageDesc, &hInteropMem, &phImageMem}; + &hContext, &hDevice, &pImageFormat, + &pImageDesc, &hExternalMem, &phImageMem}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_ARRAY_EXP, "urBindlessImagesMapExternalArrayExp", ¶ms); - getContext()->logger.info("---> urBindlessImagesMapExternalArrayExp"); + auto &logger = getContext()->logger; + logger.info(" 
---> urBindlessImagesMapExternalArrayExp\n"); ur_result_t result = pfnMapExternalArrayExp( - hContext, hDevice, pImageFormat, pImageDesc, hInteropMem, phImageMem); + hContext, hDevice, pImageFormat, pImageDesc, hExternalMem, phImageMem); getContext()->notify_end(UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_ARRAY_EXP, "urBindlessImagesMapExternalArrayExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_ARRAY_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_ARRAY_EXP, + ¶ms); + logger.info(" <--- urBindlessImagesMapExternalArrayExp({}) -> {};\n", + args_str.str(), result); + } return result; } /////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urBindlessImagesReleaseInteropExp -__urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( +/// @brief Intercept function for urBindlessImagesMapExternalLinearMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( ur_context_handle_t hContext, ///< [in] handle of the context object ur_device_handle_t hDevice, ///< [in] handle of the device object - ur_exp_interop_mem_handle_t - hInteropMem ///< [in][release] handle of interop memory to be destroyed + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory ) { - auto pfnReleaseInteropExp = - getContext()->urDdiTable.BindlessImagesExp.pfnReleaseInteropExp; + auto pfnMapExternalLinearMemoryExp = + getContext() + 
->urDdiTable.BindlessImagesExp.pfnMapExternalLinearMemoryExp; - if (nullptr == pfnReleaseInteropExp) { + if (nullptr == pfnMapExternalLinearMemoryExp) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - ur_bindless_images_release_interop_exp_params_t params = { - &hContext, &hDevice, &hInteropMem}; + ur_bindless_images_map_external_linear_memory_exp_params_t params = { + &hContext, &hDevice, &offset, &size, &hExternalMem, &ppRetMem}; uint64_t instance = getContext()->notify_begin( - UR_FUNCTION_BINDLESS_IMAGES_RELEASE_INTEROP_EXP, - "urBindlessImagesReleaseInteropExp", ¶ms); + UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP, + "urBindlessImagesMapExternalLinearMemoryExp", ¶ms); - getContext()->logger.info("---> urBindlessImagesReleaseInteropExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesMapExternalLinearMemoryExp\n"); - ur_result_t result = pfnReleaseInteropExp(hContext, hDevice, hInteropMem); + ur_result_t result = pfnMapExternalLinearMemoryExp( + hContext, hDevice, offset, size, hExternalMem, ppRetMem); - getContext()->notify_end(UR_FUNCTION_BINDLESS_IMAGES_RELEASE_INTEROP_EXP, - "urBindlessImagesReleaseInteropExp", ¶ms, - &result, instance); + getContext()->notify_end( + UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP, + "urBindlessImagesMapExternalLinearMemoryExp", ¶ms, &result, + instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_BINDLESS_IMAGES_RELEASE_INTEROP_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, + UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_LINEAR_MEMORY_EXP, + ¶ms); + logger.info( + " <--- urBindlessImagesMapExternalLinearMemoryExp({}) -> {};\n", + args_str.str(), result); + } + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// 
@brief Intercept function for urBindlessImagesReleaseExternalMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + ur_exp_external_mem_handle_t + hExternalMem ///< [in][release] handle of external memory to be destroyed +) { + auto pfnReleaseExternalMemoryExp = + getContext()->urDdiTable.BindlessImagesExp.pfnReleaseExternalMemoryExp; + + if (nullptr == pfnReleaseExternalMemoryExp) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_bindless_images_release_external_memory_exp_params_t params = { + &hContext, &hDevice, &hExternalMem}; + uint64_t instance = getContext()->notify_begin( + UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP, + "urBindlessImagesReleaseExternalMemoryExp", ¶ms); + + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesReleaseExternalMemoryExp\n"); + + ur_result_t result = + pfnReleaseExternalMemoryExp(hContext, hDevice, hExternalMem); + + getContext()->notify_end( + UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP, + "urBindlessImagesReleaseExternalMemoryExp", ¶ms, &result, instance); + + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_MEMORY_EXP, + ¶ms); + logger.info( + " <--- urBindlessImagesReleaseExternalMemoryExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6091,10 +6769,10 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( ur_device_handle_t hDevice, ///< [in] handle of the device object ur_exp_external_semaphore_type_t semHandleType, ///< [in] type of external memory handle - ur_exp_interop_semaphore_desc_t - *pInteropSemaphoreDesc, ///< [in] the interop semaphore descriptor - ur_exp_interop_semaphore_handle_t * - phInteropSemaphore ///< [out] 
interop semaphore handle to the external semaphore + ur_exp_external_semaphore_desc_t + *pExternalSemaphoreDesc, ///< [in] the external semaphore descriptor + ur_exp_external_semaphore_handle_t * + phExternalSemaphore ///< [out] external semaphore handle to the external semaphore ) { auto pfnImportExternalSemaphoreExp = getContext() @@ -6105,29 +6783,33 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( } ur_bindless_images_import_external_semaphore_exp_params_t params = { - &hContext, &hDevice, &semHandleType, &pInteropSemaphoreDesc, - &phInteropSemaphore}; + &hContext, &hDevice, &semHandleType, &pExternalSemaphoreDesc, + &phExternalSemaphore}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_EXP, "urBindlessImagesImportExternalSemaphoreExp", ¶ms); - getContext()->logger.info( - "---> urBindlessImagesImportExternalSemaphoreExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesImportExternalSemaphoreExp\n"); ur_result_t result = pfnImportExternalSemaphoreExp( - hContext, hDevice, semHandleType, pInteropSemaphoreDesc, - phInteropSemaphore); + hContext, hDevice, semHandleType, pExternalSemaphoreDesc, + phExternalSemaphore); getContext()->notify_end( UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_EXP, "urBindlessImagesImportExternalSemaphoreExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_EXP, + ¶ms); + logger.info( + " <--- urBindlessImagesImportExternalSemaphoreExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6137,8 +6819,8 @@ __urdlllocal ur_result_t 
UR_APICALL urBindlessImagesImportExternalSemaphoreExp( __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( ur_context_handle_t hContext, ///< [in] handle of the context object ur_device_handle_t hDevice, ///< [in] handle of the device object - ur_exp_interop_semaphore_handle_t - hInteropSemaphore ///< [in][release] handle of interop semaphore to be destroyed + ur_exp_external_semaphore_handle_t + hExternalSemaphore ///< [in][release] handle of external semaphore to be destroyed ) { auto pfnReleaseExternalSemaphoreExp = getContext() @@ -6149,27 +6831,32 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( } ur_bindless_images_release_external_semaphore_exp_params_t params = { - &hContext, &hDevice, &hInteropSemaphore}; + &hContext, &hDevice, &hExternalSemaphore}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_SEMAPHORE_EXP, "urBindlessImagesReleaseExternalSemaphoreExp", ¶ms); - getContext()->logger.info( - "---> urBindlessImagesReleaseExternalSemaphoreExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesReleaseExternalSemaphoreExp\n"); ur_result_t result = - pfnReleaseExternalSemaphoreExp(hContext, hDevice, hInteropSemaphore); + pfnReleaseExternalSemaphoreExp(hContext, hDevice, hExternalSemaphore); getContext()->notify_end( UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_SEMAPHORE_EXP, "urBindlessImagesReleaseExternalSemaphoreExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_SEMAPHORE_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, + UR_FUNCTION_BINDLESS_IMAGES_RELEASE_EXTERNAL_SEMAPHORE_EXP, + ¶ms); + logger.info( + " <--- urBindlessImagesReleaseExternalSemaphoreExp({}) -> 
{};\n", + args_str.str(), result); + } return result; } @@ -6178,8 +6865,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( /// @brief Intercept function for urBindlessImagesWaitExternalSemaphoreExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( ur_queue_handle_t hQueue, ///< [in] handle of the queue object - ur_exp_interop_semaphore_handle_t - hSemaphore, ///< [in] interop semaphore handle + ur_exp_external_semaphore_handle_t + hSemaphore, ///< [in] external semaphore handle bool hasWaitValue, ///< [in] indicates whether the samephore is capable and should wait on a ///< certain value. @@ -6195,7 +6882,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnWaitExternalSemaphoreExp = getContext()->urDdiTable.BindlessImagesExp.pfnWaitExternalSemaphoreExp; @@ -6212,7 +6900,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( UR_FUNCTION_BINDLESS_IMAGES_WAIT_EXTERNAL_SEMAPHORE_EXP, "urBindlessImagesWaitExternalSemaphoreExp", ¶ms); - getContext()->logger.info("---> urBindlessImagesWaitExternalSemaphoreExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesWaitExternalSemaphoreExp\n"); ur_result_t result = pfnWaitExternalSemaphoreExp( hQueue, hSemaphore, hasWaitValue, waitValue, numEventsInWaitList, @@ -6222,11 +6911,15 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( UR_FUNCTION_BINDLESS_IMAGES_WAIT_EXTERNAL_SEMAPHORE_EXP, "urBindlessImagesWaitExternalSemaphoreExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_BINDLESS_IMAGES_WAIT_EXTERNAL_SEMAPHORE_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_WAIT_EXTERNAL_SEMAPHORE_EXP, + ¶ms); + logger.info( + " <--- urBindlessImagesWaitExternalSemaphoreExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6235,8 +6928,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( /// @brief Intercept function for urBindlessImagesSignalExternalSemaphoreExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( ur_queue_handle_t hQueue, ///< [in] handle of the queue object - ur_exp_interop_semaphore_handle_t - hSemaphore, ///< [in] interop semaphore handle + ur_exp_external_semaphore_handle_t + hSemaphore, ///< [in] external semaphore handle bool hasSignalValue, ///< [in] indicates whether the samephore is capable and should signal on a ///< certain value. 
@@ -6252,7 +6945,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnSignalExternalSemaphoreExp = getContext() @@ -6270,8 +6964,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( UR_FUNCTION_BINDLESS_IMAGES_SIGNAL_EXTERNAL_SEMAPHORE_EXP, "urBindlessImagesSignalExternalSemaphoreExp", ¶ms); - getContext()->logger.info( - "---> urBindlessImagesSignalExternalSemaphoreExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urBindlessImagesSignalExternalSemaphoreExp\n"); ur_result_t result = pfnSignalExternalSemaphoreExp( hQueue, hSemaphore, hasSignalValue, signalValue, numEventsInWaitList, @@ -6282,11 +6976,15 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( "urBindlessImagesSignalExternalSemaphoreExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_BINDLESS_IMAGES_SIGNAL_EXTERNAL_SEMAPHORE_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_SIGNAL_EXTERNAL_SEMAPHORE_EXP, + ¶ms); + logger.info( + " <--- urBindlessImagesSignalExternalSemaphoreExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6313,7 +7011,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( getContext()->notify_begin(UR_FUNCTION_COMMAND_BUFFER_CREATE_EXP, "urCommandBufferCreateExp", ¶ms); - getContext()->logger.info("---> urCommandBufferCreateExp"); + auto &logger = getContext()->logger; + logger.info(" ---> 
urCommandBufferCreateExp\n"); ur_result_t result = pfnCreateExp(hContext, hDevice, pCommandBufferDesc, phCommandBuffer); @@ -6322,10 +7021,13 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( "urCommandBufferCreateExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_CREATE_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_CREATE_EXP, ¶ms); + logger.info(" <--- urCommandBufferCreateExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6347,7 +7049,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferRetainExp( getContext()->notify_begin(UR_FUNCTION_COMMAND_BUFFER_RETAIN_EXP, "urCommandBufferRetainExp", ¶ms); - getContext()->logger.info("---> urCommandBufferRetainExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferRetainExp\n"); ur_result_t result = pfnRetainExp(hCommandBuffer); @@ -6355,10 +7058,13 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferRetainExp( "urCommandBufferRetainExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_RETAIN_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_RETAIN_EXP, ¶ms); + logger.info(" <--- urCommandBufferRetainExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6381,7 +7087,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferReleaseExp( getContext()->notify_begin(UR_FUNCTION_COMMAND_BUFFER_RELEASE_EXP, "urCommandBufferReleaseExp", ¶ms); - getContext()->logger.info("---> urCommandBufferReleaseExp"); + auto 
&logger = getContext()->logger; + logger.info(" ---> urCommandBufferReleaseExp\n"); ur_result_t result = pfnReleaseExp(hCommandBuffer); @@ -6389,10 +7096,13 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferReleaseExp( "urCommandBufferReleaseExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_RELEASE_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_RELEASE_EXP, ¶ms); + logger.info(" <--- urCommandBufferReleaseExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6415,7 +7125,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferFinalizeExp( getContext()->notify_begin(UR_FUNCTION_COMMAND_BUFFER_FINALIZE_EXP, "urCommandBufferFinalizeExp", ¶ms); - getContext()->logger.info("---> urCommandBufferFinalizeExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferFinalizeExp\n"); ur_result_t result = pfnFinalizeExp(hCommandBuffer); @@ -6423,10 +7134,13 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferFinalizeExp( "urCommandBufferFinalizeExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_FINALIZE_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_FINALIZE_EXP, ¶ms); + logger.info(" <--- urCommandBufferFinalizeExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6443,16 +7157,37 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( const size_t * pGlobalWorkSize, ///< [in] Global work size to use when executing kernel. 
const size_t * - pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. + pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. If this + ///< parameter is nullptr, then a local work size will be generated by the + ///< implementation. + uint32_t + numKernelAlternatives, ///< [in] The number of kernel alternatives provided in + ///< phKernelAlternatives. + ur_kernel_handle_t * + phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernel handles + ///< that might be used to update the kernel in this + ///< command after the command-buffer is finalized. The default kernel + ///< `hKernel` is implicitly marked as an alternative. It's + ///< invalid to specify it as part of this list. uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * pSyncPoint, ///< [out][optional] Sync point associated with this command. - ur_exp_command_buffer_command_handle_t - *phCommand ///< [out][optional] Handle to this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t * + phCommand ///< [out][optional] Handle to this command. Only available if the + ///< command-buffer is updatable. 
) { auto pfnAppendKernelLaunchExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendKernelLaunchExp; @@ -6468,29 +7203,40 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( &pGlobalWorkOffset, &pGlobalWorkSize, &pLocalWorkSize, + &numKernelAlternatives, + &phKernelAlternatives, &numSyncPointsInWaitList, &pSyncPointWaitList, + &numEventsInWaitList, + &phEventWaitList, &pSyncPoint, + &phEvent, &phCommand}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_EXP, "urCommandBufferAppendKernelLaunchExp", ¶ms); - getContext()->logger.info("---> urCommandBufferAppendKernelLaunchExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferAppendKernelLaunchExp\n"); ur_result_t result = pfnAppendKernelLaunchExp( hCommandBuffer, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, - phCommand); + pLocalWorkSize, numKernelAlternatives, phKernelAlternatives, + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); getContext()->notify_end( UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_EXP, "urCommandBufferAppendKernelLaunchExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_EXP, + ¶ms); + logger.info(" <--- urCommandBufferAppendKernelLaunchExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6508,8 +7254,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of 
sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { auto pfnAppendUSMMemcpyExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendUSMMemcpyExp; @@ -6519,26 +7276,41 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( } ur_command_buffer_append_usm_memcpy_exp_params_t params = { - &hCommandBuffer, &pDst, &pSrc, &size, &numSyncPointsInWaitList, - &pSyncPointWaitList, &pSyncPoint}; + &hCommandBuffer, + &pDst, + &pSrc, + &size, + &numSyncPointsInWaitList, + &pSyncPointWaitList, + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP, "urCommandBufferAppendUSMMemcpyExp", ¶ms); - getContext()->logger.info("---> urCommandBufferAppendUSMMemcpyExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferAppendUSMMemcpyExp\n"); - ur_result_t result = pfnAppendUSMMemcpyExp(hCommandBuffer, pDst, pSrc, size, - numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + ur_result_t result = pfnAppendUSMMemcpyExp( + hCommandBuffer, pDst, pSrc, size, 
numSyncPointsInWaitList, + pSyncPointWaitList, numEventsInWaitList, phEventWaitList, pSyncPoint, + phEvent, phCommand); getContext()->notify_end(UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP, "urCommandBufferAppendUSMMemcpyExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP, + ¶ms); + logger.info(" <--- urCommandBufferAppendUSMMemcpyExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6558,8 +7330,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) { auto pfnAppendUSMFillExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendUSMFillExp; @@ -6569,27 +7352,41 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( } ur_command_buffer_append_usm_fill_exp_params_t params = { - &hCommandBuffer, &pMemory, &pPattern, - &patternSize, &size, &numSyncPointsInWaitList, - &pSyncPointWaitList, &pSyncPoint}; + &hCommandBuffer, + &pMemory, + &pPattern, + &patternSize, + &size, + &numSyncPointsInWaitList, + &pSyncPointWaitList, + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP, "urCommandBufferAppendUSMFillExp", ¶ms); - getContext()->logger.info("---> urCommandBufferAppendUSMFillExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferAppendUSMFillExp\n"); ur_result_t result = pfnAppendUSMFillExp( hCommandBuffer, pMemory, pPattern, patternSize, size, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); getContext()->notify_end(UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP, "urCommandBufferAppendUSMFillExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP, ¶ms); + logger.info(" <--- urCommandBufferAppendUSMFillExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6609,8 +7406,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that 
this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { auto pfnAppendMemBufferCopyExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendMemBufferCopyExp; @@ -6628,26 +7436,36 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( &size, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP, "urCommandBufferAppendMemBufferCopyExp", ¶ms); - getContext()->logger.info("---> urCommandBufferAppendMemBufferCopyExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferAppendMemBufferCopyExp\n"); ur_result_t result = pfnAppendMemBufferCopyExp( hCommandBuffer, hSrcMem, hDstMem, srcOffset, dstOffset, size, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); getContext()->notify_end( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP, "urCommandBufferAppendMemBufferCopyExp", ¶ms, 
&result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP, + ¶ms); + logger.info( + " <--- urCommandBufferAppendMemBufferCopyExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6667,8 +7485,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) { auto pfnAppendMemBufferWriteExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendMemBufferWriteExp; @@ -6685,26 +7514,36 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( &pSrc, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP, "urCommandBufferAppendMemBufferWriteExp", ¶ms); - getContext()->logger.info("---> urCommandBufferAppendMemBufferWriteExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferAppendMemBufferWriteExp\n"); ur_result_t result = pfnAppendMemBufferWriteExp( hCommandBuffer, hBuffer, offset, size, pSrc, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + pSyncPointWaitList, numEventsInWaitList, phEventWaitList, pSyncPoint, + phEvent, phCommand); getContext()->notify_end( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP, "urCommandBufferAppendMemBufferWriteExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP, + ¶ms); + logger.info( + " <--- urCommandBufferAppendMemBufferWriteExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6723,8 +7562,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. 
+ const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { auto pfnAppendMemBufferReadExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendMemBufferReadExp; @@ -6741,26 +7591,36 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( &pDst, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_EXP, "urCommandBufferAppendMemBufferReadExp", ¶ms); - getContext()->logger.info("---> urCommandBufferAppendMemBufferReadExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferAppendMemBufferReadExp\n"); ur_result_t result = pfnAppendMemBufferReadExp( hCommandBuffer, hBuffer, offset, size, pDst, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + pSyncPointWaitList, numEventsInWaitList, phEventWaitList, pSyncPoint, + phEvent, phCommand); getContext()->notify_end( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_EXP, "urCommandBufferAppendMemBufferReadExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_EXP, - ¶ms); - 
getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_EXP, + ¶ms); + logger.info( + " <--- urCommandBufferAppendMemBufferReadExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6787,8 +7647,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) { auto pfnAppendMemBufferCopyRectExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendMemBufferCopyRectExp; @@ -6810,28 +7681,39 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( &dstSlicePitch, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_RECT_EXP, "urCommandBufferAppendMemBufferCopyRectExp", ¶ms); - getContext()->logger.info("---> urCommandBufferAppendMemBufferCopyRectExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferAppendMemBufferCopyRectExp\n"); ur_result_t result = pfnAppendMemBufferCopyRectExp( hCommandBuffer, hSrcMem, hDstMem, srcOrigin, dstOrigin, region, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); getContext()->notify_end( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_RECT_EXP, "urCommandBufferAppendMemBufferCopyRectExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_RECT_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, + UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_RECT_EXP, + ¶ms); + logger.info( + " <--- urCommandBufferAppendMemBufferCopyRectExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6864,8 +7746,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this 
command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { auto pfnAppendMemBufferWriteRectExp = getContext() @@ -6888,29 +7781,39 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( &pSrc, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_RECT_EXP, "urCommandBufferAppendMemBufferWriteRectExp", ¶ms); - getContext()->logger.info( - "---> urCommandBufferAppendMemBufferWriteRectExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferAppendMemBufferWriteRectExp\n"); ur_result_t result = pfnAppendMemBufferWriteRectExp( hCommandBuffer, hBuffer, bufferOffset, hostOffset, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); getContext()->notify_end( 
UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_RECT_EXP, "urCommandBufferAppendMemBufferWriteRectExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_RECT_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, + UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_RECT_EXP, + ¶ms); + logger.info( + " <--- urCommandBufferAppendMemBufferWriteRectExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -6941,8 +7844,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) { auto pfnAppendMemBufferReadRectExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendMemBufferReadRectExp; @@ -6964,28 +7878,39 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( &pDst, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_RECT_EXP, "urCommandBufferAppendMemBufferReadRectExp", ¶ms); - getContext()->logger.info("---> urCommandBufferAppendMemBufferReadRectExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferAppendMemBufferReadRectExp\n"); ur_result_t result = pfnAppendMemBufferReadRectExp( hCommandBuffer, hBuffer, bufferOffset, hostOffset, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); getContext()->notify_end( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_RECT_EXP, "urCommandBufferAppendMemBufferReadRectExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_RECT_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, + UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_RECT_EXP, + ¶ms); + logger.info( + " <--- urCommandBufferAppendMemBufferReadRectExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7006,8 +7931,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command 
depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { auto pfnAppendMemBufferFillExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendMemBufferFillExp; @@ -7025,26 +7961,36 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( &size, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_FILL_EXP, "urCommandBufferAppendMemBufferFillExp", ¶ms); - getContext()->logger.info("---> urCommandBufferAppendMemBufferFillExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferAppendMemBufferFillExp\n"); ur_result_t result = pfnAppendMemBufferFillExp( hCommandBuffer, hBuffer, pPattern, patternSize, offset, size, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); getContext()->notify_end( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_FILL_EXP, "urCommandBufferAppendMemBufferFillExp", ¶ms, &result, 
instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_FILL_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_FILL_EXP, + ¶ms); + logger.info( + " <--- urCommandBufferAppendMemBufferFillExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7062,8 +8008,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) { auto pfnAppendUSMPrefetchExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendUSMPrefetchExp; @@ -7079,25 +8036,35 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( &flags, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_PREFETCH_EXP, "urCommandBufferAppendUSMPrefetchExp", ¶ms); - getContext()->logger.info("---> urCommandBufferAppendUSMPrefetchExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferAppendUSMPrefetchExp\n"); ur_result_t result = pfnAppendUSMPrefetchExp( hCommandBuffer, pMemory, size, flags, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + pSyncPointWaitList, numEventsInWaitList, phEventWaitList, pSyncPoint, + phEvent, phCommand); getContext()->notify_end(UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_PREFETCH_EXP, "urCommandBufferAppendUSMPrefetchExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_PREFETCH_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_PREFETCH_EXP, + ¶ms); + logger.info(" <--- urCommandBufferAppendUSMPrefetchExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7115,8 +8082,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. 
+ const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { auto pfnAppendUSMAdviseExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendUSMAdviseExp; @@ -7132,25 +8110,35 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( &advice, &numSyncPointsInWaitList, &pSyncPointWaitList, - &pSyncPoint}; + &numEventsInWaitList, + &phEventWaitList, + &pSyncPoint, + &phEvent, + &phCommand}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_ADVISE_EXP, "urCommandBufferAppendUSMAdviseExp", ¶ms); - getContext()->logger.info("---> urCommandBufferAppendUSMAdviseExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferAppendUSMAdviseExp\n"); - ur_result_t result = pfnAppendUSMAdviseExp(hCommandBuffer, pMemory, size, - advice, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + ur_result_t result = pfnAppendUSMAdviseExp( + hCommandBuffer, pMemory, size, advice, numSyncPointsInWaitList, + pSyncPointWaitList, numEventsInWaitList, phEventWaitList, pSyncPoint, + phEvent, phCommand); getContext()->notify_end(UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_ADVISE_EXP, "urCommandBufferAppendUSMAdviseExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, 
UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_ADVISE_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_ADVISE_EXP, + ¶ms); + logger.info(" <--- urCommandBufferAppendUSMAdviseExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7169,7 +8157,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferEnqueueExp( ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command-buffer execution instance. + ///< command-buffer execution instance. If phEventWaitList and phEvent are + ///< not NULL, phEvent must not refer to an element of the phEventWaitList array. ) { auto pfnEnqueueExp = getContext()->urDdiTable.CommandBufferExp.pfnEnqueueExp; @@ -7185,7 +8174,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferEnqueueExp( getContext()->notify_begin(UR_FUNCTION_COMMAND_BUFFER_ENQUEUE_EXP, "urCommandBufferEnqueueExp", ¶ms); - getContext()->logger.info("---> urCommandBufferEnqueueExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferEnqueueExp\n"); ur_result_t result = pfnEnqueueExp( hCommandBuffer, hQueue, numEventsInWaitList, phEventWaitList, phEvent); @@ -7194,10 +8184,13 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferEnqueueExp( "urCommandBufferEnqueueExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_ENQUEUE_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_ENQUEUE_EXP, ¶ms); + logger.info(" <--- urCommandBufferEnqueueExp({}) -> 
{};\n", + args_str.str(), result); + } return result; } @@ -7220,7 +8213,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferRetainCommandExp( UR_FUNCTION_COMMAND_BUFFER_RETAIN_COMMAND_EXP, "urCommandBufferRetainCommandExp", ¶ms); - getContext()->logger.info("---> urCommandBufferRetainCommandExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferRetainCommandExp\n"); ur_result_t result = pfnRetainCommandExp(hCommand); @@ -7228,10 +8222,13 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferRetainCommandExp( "urCommandBufferRetainCommandExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_RETAIN_COMMAND_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_RETAIN_COMMAND_EXP, ¶ms); + logger.info(" <--- urCommandBufferRetainCommandExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7254,7 +8251,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( UR_FUNCTION_COMMAND_BUFFER_RELEASE_COMMAND_EXP, "urCommandBufferReleaseCommandExp", ¶ms); - getContext()->logger.info("---> urCommandBufferReleaseCommandExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferReleaseCommandExp\n"); ur_result_t result = pfnReleaseCommandExp(hCommand); @@ -7262,10 +8260,13 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( "urCommandBufferReleaseCommandExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_RELEASE_COMMAND_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, 
UR_FUNCTION_COMMAND_BUFFER_RELEASE_COMMAND_EXP, ¶ms); + logger.info(" <--- urCommandBufferReleaseCommandExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7291,7 +8292,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( UR_FUNCTION_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_EXP, "urCommandBufferUpdateKernelLaunchExp", ¶ms); - getContext()->logger.info("---> urCommandBufferUpdateKernelLaunchExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferUpdateKernelLaunchExp\n"); ur_result_t result = pfnUpdateKernelLaunchExp(hCommand, pUpdateKernelLaunch); @@ -7300,10 +8302,101 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( UR_FUNCTION_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_EXP, "urCommandBufferUpdateKernelLaunchExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_EXP, + ¶ms); + logger.info(" <--- urCommandBufferUpdateKernelLaunchExp({}) -> {};\n", + args_str.str(), result); + } + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urCommandBufferUpdateSignalEventExp +__urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateSignalEventExp( + ur_exp_command_buffer_command_handle_t + hCommand, ///< [in] Handle of the command-buffer command to update. + ur_event_handle_t *phSignalEvent ///< [out] Event to be signaled. 
+) { + auto pfnUpdateSignalEventExp = + getContext()->urDdiTable.CommandBufferExp.pfnUpdateSignalEventExp; + + if (nullptr == pfnUpdateSignalEventExp) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_command_buffer_update_signal_event_exp_params_t params = { + &hCommand, &phSignalEvent}; + uint64_t instance = getContext()->notify_begin( + UR_FUNCTION_COMMAND_BUFFER_UPDATE_SIGNAL_EVENT_EXP, + "urCommandBufferUpdateSignalEventExp", ¶ms); + + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferUpdateSignalEventExp\n"); + + ur_result_t result = pfnUpdateSignalEventExp(hCommand, phSignalEvent); + + getContext()->notify_end(UR_FUNCTION_COMMAND_BUFFER_UPDATE_SIGNAL_EVENT_EXP, + "urCommandBufferUpdateSignalEventExp", ¶ms, + &result, instance); + + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_UPDATE_SIGNAL_EVENT_EXP, + ¶ms); + logger.info(" <--- urCommandBufferUpdateSignalEventExp({}) -> {};\n", + args_str.str(), result); + } + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urCommandBufferUpdateWaitEventsExp +__urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateWaitEventsExp( + ur_exp_command_buffer_command_handle_t + hCommand, ///< [in] Handle of the command-buffer command to update. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. 
+) { + auto pfnUpdateWaitEventsExp = + getContext()->urDdiTable.CommandBufferExp.pfnUpdateWaitEventsExp; + + if (nullptr == pfnUpdateWaitEventsExp) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_command_buffer_update_wait_events_exp_params_t params = { + &hCommand, &numEventsInWaitList, &phEventWaitList}; + uint64_t instance = getContext()->notify_begin( + UR_FUNCTION_COMMAND_BUFFER_UPDATE_WAIT_EVENTS_EXP, + "urCommandBufferUpdateWaitEventsExp", ¶ms); + + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferUpdateWaitEventsExp\n"); + + ur_result_t result = + pfnUpdateWaitEventsExp(hCommand, numEventsInWaitList, phEventWaitList); + + getContext()->notify_end(UR_FUNCTION_COMMAND_BUFFER_UPDATE_WAIT_EVENTS_EXP, + "urCommandBufferUpdateWaitEventsExp", ¶ms, + &result, instance); + + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_UPDATE_WAIT_EVENTS_EXP, + ¶ms); + logger.info(" <--- urCommandBufferUpdateWaitEventsExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7336,7 +8429,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferGetInfoExp( getContext()->notify_begin(UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP, "urCommandBufferGetInfoExp", ¶ms); - getContext()->logger.info("---> urCommandBufferGetInfoExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferGetInfoExp\n"); ur_result_t result = pfnGetInfoExp(hCommandBuffer, propName, propSize, pPropValue, pPropSizeRet); @@ -7345,10 +8439,13 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferGetInfoExp( "urCommandBufferGetInfoExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + 
ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP, ¶ms); + logger.info(" <--- urCommandBufferGetInfoExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7381,7 +8478,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCommandGetInfoExp( UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP, "urCommandBufferCommandGetInfoExp", ¶ms); - getContext()->logger.info("---> urCommandBufferCommandGetInfoExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urCommandBufferCommandGetInfoExp\n"); ur_result_t result = pfnCommandGetInfoExp(hCommand, propName, propSize, pPropValue, pPropSizeRet); @@ -7390,10 +8488,13 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCommandGetInfoExp( "urCommandBufferCommandGetInfoExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP, ¶ms); + logger.info(" <--- urCommandBufferCommandGetInfoExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7417,17 +8518,16 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that ///< specify the number of local work-items forming a work-group that will ///< execute the kernel function. - ///< If nullptr, the runtime implementation will choose the work-group - ///< size. + ///< If nullptr, the runtime implementation will choose the work-group size. 
uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { auto pfnCooperativeKernelLaunchExp = getContext()->urDdiTable.EnqueueExp.pfnCooperativeKernelLaunchExp; @@ -7450,7 +8550,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP, "urEnqueueCooperativeKernelLaunchExp", ¶ms); - getContext()->logger.info("---> urEnqueueCooperativeKernelLaunchExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueCooperativeKernelLaunchExp\n"); ur_result_t result = pfnCooperativeKernelLaunchExp( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, @@ -7460,10 +8561,14 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( "urEnqueueCooperativeKernelLaunchExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP, + ¶ms); + logger.info(" <--- urEnqueueCooperativeKernelLaunchExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7494,8 +8599,8 @@ 
__urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP, "urKernelSuggestMaxCooperativeGroupCountExp", ¶ms); - getContext()->logger.info( - "---> urKernelSuggestMaxCooperativeGroupCountExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urKernelSuggestMaxCooperativeGroupCountExp\n"); ur_result_t result = pfnSuggestMaxCooperativeGroupCountExp( hKernel, localWorkSize, dynamicSharedMemorySize, pGroupCountRet); @@ -7505,11 +8610,16 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( "urKernelSuggestMaxCooperativeGroupCountExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, + UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP, + ¶ms); + logger.info( + " <--- urKernelSuggestMaxCooperativeGroupCountExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7526,8 +8636,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait - ///< events. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies this particular kernel ///< execution instance. 
Profiling information can be queried @@ -7535,7 +8644,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` ///< reports the timestamp at the time of the call to this function. ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` - ///< reports the timestamp recorded when the command is executed on the device. + ///< reports the timestamp recorded when the command is executed on the + ///< device. If phEventWaitList and phEvent are not NULL, phEvent must not + ///< refer to an element of the phEventWaitList array. ) { auto pfnTimestampRecordingExp = getContext()->urDdiTable.EnqueueExp.pfnTimestampRecordingExp; @@ -7550,7 +8661,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( getContext()->notify_begin(UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP, "urEnqueueTimestampRecordingExp", ¶ms); - getContext()->logger.info("---> urEnqueueTimestampRecordingExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueTimestampRecordingExp\n"); ur_result_t result = pfnTimestampRecordingExp( hQueue, blocking, numEventsInWaitList, phEventWaitList, phEvent); @@ -7559,10 +8671,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( "urEnqueueTimestampRecordingExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP, ¶ms); + logger.info(" <--- urEnqueueTimestampRecordingExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7595,7 +8710,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( ///< the numEventsInWaitList must be 0, 
indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList + ///< array. ) { auto pfnKernelLaunchCustomExp = getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchCustomExp; @@ -7614,7 +8731,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( getContext()->notify_begin(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP, "urEnqueueKernelLaunchCustomExp", ¶ms); - getContext()->logger.info("---> urEnqueueKernelLaunchCustomExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueKernelLaunchCustomExp\n"); ur_result_t result = pfnKernelLaunchCustomExp( hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, @@ -7625,10 +8743,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( "urEnqueueKernelLaunchCustomExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP, ¶ms); + logger.info(" <--- urEnqueueKernelLaunchCustomExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7654,17 +8775,21 @@ __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_PROGRAM_BUILD_EXP, "urProgramBuildExp", ¶ms); - getContext()->logger.info("---> urProgramBuildExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramBuildExp\n"); ur_result_t result = pfnBuildExp(hProgram, numDevices, phDevices, pOptions); 
getContext()->notify_end(UR_FUNCTION_PROGRAM_BUILD_EXP, "urProgramBuildExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_BUILD_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_BUILD_EXP, + ¶ms); + logger.info(" <--- urProgramBuildExp({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -7691,7 +8816,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramCompileExp( uint64_t instance = getContext()->notify_begin( UR_FUNCTION_PROGRAM_COMPILE_EXP, "urProgramCompileExp", ¶ms); - getContext()->logger.info("---> urProgramCompileExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramCompileExp\n"); ur_result_t result = pfnCompileExp(hProgram, numDevices, phDevices, pOptions); @@ -7699,10 +8825,13 @@ __urdlllocal ur_result_t UR_APICALL urProgramCompileExp( getContext()->notify_end(UR_FUNCTION_PROGRAM_COMPILE_EXP, "urProgramCompileExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_COMPILE_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_COMPILE_EXP, ¶ms); + logger.info(" <--- urProgramCompileExp({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -7737,7 +8866,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramLinkExp( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_PROGRAM_LINK_EXP, "urProgramLinkExp", ¶ms); - getContext()->logger.info("---> urProgramLinkExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urProgramLinkExp\n"); ur_result_t result = pfnLinkExp(hContext, numDevices, phDevices, count, 
phPrograms, pOptions, phProgram); @@ -7745,10 +8875,13 @@ __urdlllocal ur_result_t UR_APICALL urProgramLinkExp( getContext()->notify_end(UR_FUNCTION_PROGRAM_LINK_EXP, "urProgramLinkExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_LINK_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_LINK_EXP, + ¶ms); + logger.info(" <--- urProgramLinkExp({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -7770,17 +8903,21 @@ __urdlllocal ur_result_t UR_APICALL urUSMImportExp( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_USM_IMPORT_EXP, "urUSMImportExp", ¶ms); - getContext()->logger.info("---> urUSMImportExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urUSMImportExp\n"); ur_result_t result = pfnImportExp(hContext, pMem, size); getContext()->notify_end(UR_FUNCTION_USM_IMPORT_EXP, "urUSMImportExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_IMPORT_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_IMPORT_EXP, + ¶ms); + logger.info(" <--- urUSMImportExp({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -7801,17 +8938,21 @@ __urdlllocal ur_result_t UR_APICALL urUSMReleaseExp( uint64_t instance = getContext()->notify_begin(UR_FUNCTION_USM_RELEASE_EXP, "urUSMReleaseExp", ¶ms); - getContext()->logger.info("---> urUSMReleaseExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urUSMReleaseExp\n"); ur_result_t result = pfnReleaseExp(hContext, pMem); getContext()->notify_end(UR_FUNCTION_USM_RELEASE_EXP, "urUSMReleaseExp", 
¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_RELEASE_EXP, - ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_RELEASE_EXP, + ¶ms); + logger.info(" <--- urUSMReleaseExp({}) -> {};\n", args_str.str(), + result); + } return result; } @@ -7836,7 +8977,8 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp( getContext()->notify_begin(UR_FUNCTION_USM_P2P_ENABLE_PEER_ACCESS_EXP, "urUsmP2PEnablePeerAccessExp", ¶ms); - getContext()->logger.info("---> urUsmP2PEnablePeerAccessExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urUsmP2PEnablePeerAccessExp\n"); ur_result_t result = pfnEnablePeerAccessExp(commandDevice, peerDevice); @@ -7844,10 +8986,13 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp( "urUsmP2PEnablePeerAccessExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_USM_P2P_ENABLE_PEER_ACCESS_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_USM_P2P_ENABLE_PEER_ACCESS_EXP, ¶ms); + logger.info(" <--- urUsmP2PEnablePeerAccessExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7872,7 +9017,8 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PDisablePeerAccessExp( getContext()->notify_begin(UR_FUNCTION_USM_P2P_DISABLE_PEER_ACCESS_EXP, "urUsmP2PDisablePeerAccessExp", ¶ms); - getContext()->logger.info("---> urUsmP2PDisablePeerAccessExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urUsmP2PDisablePeerAccessExp\n"); ur_result_t result = pfnDisablePeerAccessExp(commandDevice, peerDevice); @@ -7880,10 +9026,13 @@ __urdlllocal 
ur_result_t UR_APICALL urUsmP2PDisablePeerAccessExp( "urUsmP2PDisablePeerAccessExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_USM_P2P_DISABLE_PEER_ACCESS_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_USM_P2P_DISABLE_PEER_ACCESS_EXP, ¶ms); + logger.info(" <--- urUsmP2PDisablePeerAccessExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7920,7 +9069,8 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( getContext()->notify_begin(UR_FUNCTION_USM_P2P_PEER_ACCESS_GET_INFO_EXP, "urUsmP2PPeerAccessGetInfoExp", ¶ms); - getContext()->logger.info("---> urUsmP2PPeerAccessGetInfoExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urUsmP2PPeerAccessGetInfoExp\n"); ur_result_t result = pfnPeerAccessGetInfoExp(commandDevice, peerDevice, propName, propSize, @@ -7930,10 +9080,13 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( "urUsmP2PPeerAccessGetInfoExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_USM_P2P_PEER_ACCESS_GET_INFO_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_USM_P2P_PEER_ACCESS_GET_INFO_EXP, ¶ms); + logger.info(" <--- urUsmP2PPeerAccessGetInfoExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -7961,7 +9114,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. 
ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies the work that has - ///< been enqueued in nativeEnqueueFunc. + ///< been enqueued in nativeEnqueueFunc. If phEventWaitList and phEvent are + ///< not NULL, phEvent must not refer to an element of the phEventWaitList array. ) { auto pfnNativeCommandExp = getContext()->urDdiTable.EnqueueExp.pfnNativeCommandExp; @@ -7983,7 +9137,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( getContext()->notify_begin(UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP, "urEnqueueNativeCommandExp", ¶ms); - getContext()->logger.info("---> urEnqueueNativeCommandExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urEnqueueNativeCommandExp\n"); ur_result_t result = pfnNativeCommandExp( hQueue, pfnNativeEnqueue, data, numMemsInMemList, phMemList, @@ -7993,10 +9148,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( "urEnqueueNativeCommandExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP, ¶ms); + logger.info(" <--- urEnqueueNativeCommandExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -8064,7 +9222,8 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( getContext()->notify_begin(UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP, "urTensorMapEncodeIm2ColExp", ¶ms); - getContext()->logger.info("---> urTensorMapEncodeIm2ColExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urTensorMapEncodeIm2ColExp\n"); ur_result_t result = pfnEncodeIm2ColExp( hDevice, TensorMapType, TensorRank, GlobalAddress, GlobalDim, @@ -8076,10 +9235,13 @@ __urdlllocal ur_result_t UR_APICALL 
urTensorMapEncodeIm2ColExp( "urTensorMapEncodeIm2ColExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_TENSOR_MAP_ENCODE_IM_2_COL_EXP, ¶ms); + logger.info(" <--- urTensorMapEncodeIm2ColExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -8134,7 +9296,8 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( getContext()->notify_begin(UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP, "urTensorMapEncodeTiledExp", ¶ms); - getContext()->logger.info("---> urTensorMapEncodeTiledExp"); + auto &logger = getContext()->logger; + logger.info(" ---> urTensorMapEncodeTiledExp\n"); ur_result_t result = pfnEncodeTiledExp( hDevice, TensorMapType, TensorRank, GlobalAddress, GlobalDim, @@ -8145,10 +9308,13 @@ __urdlllocal ur_result_t UR_APICALL urTensorMapEncodeTiledExp( "urTensorMapEncodeTiledExp", ¶ms, &result, instance); - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP, ¶ms); - getContext()->logger.info("({}) -> {};\n", args_str.str(), result); + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_TENSOR_MAP_ENCODE_TILED_EXP, ¶ms); + logger.info(" <--- urTensorMapEncodeTiledExp({}) -> {};\n", + args_str.str(), result); + } return result; } @@ -8275,9 +9441,15 @@ __urdlllocal ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnMapExternalArrayExp = ur_tracing_layer::urBindlessImagesMapExternalArrayExp; - dditable.pfnReleaseInteropExp = pDdiTable->pfnReleaseInteropExp; - pDdiTable->pfnReleaseInteropExp = - ur_tracing_layer::urBindlessImagesReleaseInteropExp; + 
dditable.pfnMapExternalLinearMemoryExp = + pDdiTable->pfnMapExternalLinearMemoryExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + ur_tracing_layer::urBindlessImagesMapExternalLinearMemoryExp; + + dditable.pfnReleaseExternalMemoryExp = + pDdiTable->pfnReleaseExternalMemoryExp; + pDdiTable->pfnReleaseExternalMemoryExp = + ur_tracing_layer::urBindlessImagesReleaseExternalMemoryExp; dditable.pfnImportExternalSemaphoreExp = pDdiTable->pfnImportExternalSemaphoreExp; @@ -8408,6 +9580,14 @@ __urdlllocal ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( pDdiTable->pfnUpdateKernelLaunchExp = ur_tracing_layer::urCommandBufferUpdateKernelLaunchExp; + dditable.pfnUpdateSignalEventExp = pDdiTable->pfnUpdateSignalEventExp; + pDdiTable->pfnUpdateSignalEventExp = + ur_tracing_layer::urCommandBufferUpdateSignalEventExp; + + dditable.pfnUpdateWaitEventsExp = pDdiTable->pfnUpdateWaitEventsExp; + pDdiTable->pfnUpdateWaitEventsExp = + ur_tracing_layer::urCommandBufferUpdateWaitEventsExp; + dditable.pfnGetInfoExp = pDdiTable->pfnGetInfoExp; pDdiTable->pfnGetInfoExp = ur_tracing_layer::urCommandBufferGetInfoExp; diff --git a/source/loader/layers/validation/ur_leak_check.hpp b/source/loader/layers/validation/ur_leak_check.hpp index 56998797a3..7ce5415d96 100644 --- a/source/loader/layers/validation/ur_leak_check.hpp +++ b/source/loader/layers/validation/ur_leak_check.hpp @@ -109,7 +109,7 @@ struct RefCountContext { // No more active adapters, so any references still held are leaked if (adapterCount == 0) { logInvalidReferences(); - clear(); + counts.clear(); } } @@ -133,9 +133,8 @@ struct RefCountContext { updateRefCount(handle, REFCOUNT_CREATE_OR_INCREASE, isAdapterHandle); } - void clear() { counts.clear(); } - template bool isReferenceValid(T handle) { + std::unique_lock lock(mutex); auto it = counts.find(static_cast(handle)); if (it == counts.end() || it->second.refCount < 1) { return false; diff --git a/source/loader/layers/validation/ur_valddi.cpp 
b/source/loader/layers/validation/ur_valddi.cpp index 1437cab570..019176b349 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -25,7 +25,7 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGet( ur_adapter_handle_t * phAdapters, ///< [out][optional][range(0, NumEntries)] array of handle of adapters. ///< If NumEntries is less than the number of adapters available, then - ///< ::urAdapterGet shall only retrieve that number of platforms. + ///< ::urAdapterGet shall only retrieve that number of adapters. uint32_t * pNumAdapters ///< [out][optional] returns the total number of adapters available. ) { @@ -36,6 +36,9 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGet( } if (getContext()->enableParameterValidation) { + if (NumEntries == 0 && phAdapters != NULL) { + return UR_RESULT_ERROR_INVALID_SIZE; + } } ur_result_t result = pfnAdapterGet(NumEntries, phAdapters, pNumAdapters); @@ -719,7 +722,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. 
ur_device_handle_t @@ -733,7 +737,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( } if (getContext()->enableParameterValidation) { - if (NULL == hPlatform) { + if (NULL == hAdapter) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } @@ -742,7 +746,12 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( } } - ur_result_t result = pfnCreateWithNativeHandle(hNativeDevice, hPlatform, + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hAdapter)) { + getContext()->refCountContext->logInvalidReference(hAdapter); + } + + ur_result_t result = pfnCreateWithNativeHandle(hNativeDevice, hAdapter, pProperties, phDevice); if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS) { @@ -3350,6 +3359,7 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgValue( *pProperties, ///< [in][optional] pointer to value properties. const void *pArgValue ///< [in] argument value represented as matching arg type. + ///< The data pointed to will be copied and therefore can be reused on return. ) { auto pfnSetArgValue = getContext()->urDdiTable.Kernel.pfnSetArgValue; @@ -3497,7 +3507,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetGroupInfo( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } - if (UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE < propName) { + if (UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE < + propName) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } } @@ -4597,17 +4608,16 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that ///< specify the number of local work-items forming a work-group that will ///< execute the kernel function. - ///< If nullptr, the runtime implementation will choose the work-group - ///< size. + ///< If nullptr, the runtime implementation will choose the work-group size. 
uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { auto pfnKernelLaunch = getContext()->urDdiTable.Enqueue.pfnKernelLaunch; @@ -4679,7 +4689,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWait( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnEventsWait = getContext()->urDdiTable.Enqueue.pfnEventsWait; @@ -4733,7 +4744,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnEventsWaitWithBarrier = getContext()->urDdiTable.Enqueue.pfnEventsWaitWithBarrier; @@ -4793,7 +4805,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferRead( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. 
+ ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnMemBufferRead = getContext()->urDdiTable.Enqueue.pfnMemBufferRead; @@ -4822,9 +4835,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferRead( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hBuffer, offset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, offset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -4873,7 +4888,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWrite( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnMemBufferWrite = getContext()->urDdiTable.Enqueue.pfnMemBufferWrite; @@ -4902,9 +4918,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWrite( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hBuffer, offset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, offset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -4963,7 +4981,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferReadRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. 
If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnMemBufferReadRect = getContext()->urDdiTable.Enqueue.pfnMemBufferReadRect; @@ -5033,9 +5052,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferReadRect( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = bounds(hBuffer, bufferOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, bufferOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5098,7 +5119,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnMemBufferWriteRect = getContext()->urDdiTable.Enqueue.pfnMemBufferWriteRect; @@ -5168,9 +5190,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = bounds(hBuffer, bufferOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, bufferOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5219,7 +5243,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. 
If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnMemBufferCopy = getContext()->urDdiTable.Enqueue.pfnMemBufferCopy; @@ -5248,14 +5273,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopy( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hBufferSrc, srcOffset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBufferSrc, srcOffset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } - if (auto boundsError = bounds(hBufferDst, dstOffset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBufferDst, dstOffset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5317,7 +5346,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnMemBufferCopyRect = getContext()->urDdiTable.Enqueue.pfnMemBufferCopyRect; @@ -5383,14 +5413,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = bounds(hBufferSrc, srcOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBufferSrc, srcOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } - if (auto boundsError = bounds(hBufferDst, dstOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBufferDst, dstOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5443,7 +5477,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferFill( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnMemBufferFill = getContext()->urDdiTable.Enqueue.pfnMemBufferFill; @@ -5492,9 +5527,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferFill( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = bounds(hBuffer, offset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, offset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5546,7 +5583,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageRead( ///< command does not wait on any event to complete. 
ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnMemImageRead = getContext()->urDdiTable.Enqueue.pfnMemImageRead; @@ -5579,9 +5617,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageRead( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = boundsImage(hImage, origin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = boundsImage(hImage, origin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5634,7 +5674,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageWrite( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnMemImageWrite = getContext()->urDdiTable.Enqueue.pfnMemImageWrite; @@ -5667,9 +5708,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageWrite( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = boundsImage(hImage, origin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = boundsImage(hImage, origin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5723,7 +5766,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageCopy( ///< command does not wait on any event to complete. 
ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnMemImageCopy = getContext()->urDdiTable.Enqueue.pfnMemImageCopy; @@ -5756,14 +5800,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageCopy( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = boundsImage(hImageSrc, srcOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = boundsImage(hImageSrc, srcOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } - if (auto boundsError = boundsImage(hImageDst, dstOrigin, region); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = boundsImage(hImageDst, dstOrigin, region); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5815,7 +5863,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferMap( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent, ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. void **ppRetMap ///< [out] return mapped pointer. TODO: move it before ///< numEventsInWaitList? 
) { @@ -5850,9 +5899,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferMap( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hBuffer, offset, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hBuffer, offset, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -5896,7 +5947,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemUnmap( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnMemUnmap = getContext()->urDdiTable.Enqueue.pfnMemUnmap; @@ -5971,7 +6023,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { auto pfnUSMFill = getContext()->urDdiTable.Enqueue.pfnUSMFill; @@ -6012,9 +6065,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hQueue, pMem, 0, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pMem, 0, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -6056,7 +6111,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnUSMMemcpy = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy; @@ -6089,14 +6145,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hQueue, pDst, 0, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pDst, 0, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } - if (auto boundsError = bounds(hQueue, pSrc, 0, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pSrc, 0, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -6136,7 +6196,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMPrefetch( ///< command does not wait on any event to complete. 
ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnUSMPrefetch = getContext()->urDdiTable.Enqueue.pfnUSMPrefetch; @@ -6169,9 +6230,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMPrefetch( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hQueue, pMem, 0, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pMem, 0, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -6230,9 +6293,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMAdvise( return UR_RESULT_ERROR_INVALID_SIZE; } - if (auto boundsError = bounds(hQueue, pMem, 0, size); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pMem, 0, size); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } } @@ -6267,11 +6332,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill2D( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. 
) { auto pfnUSMFill2D = getContext()->urDdiTable.Enqueue.pfnUSMFill2D; @@ -6332,9 +6397,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill2D( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hQueue, pMem, 0, pitch * height); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pMem, 0, pitch * height); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -6378,11 +6445,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. 
) { auto pfnUSMMemcpy2D = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy2D; @@ -6431,14 +6498,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } - if (auto boundsError = bounds(hQueue, pDst, 0, dstPitch * height); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pDst, 0, dstPitch * height); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } - if (auto boundsError = bounds(hQueue, pSrc, 0, srcPitch * height); - boundsError != UR_RESULT_SUCCESS) { - return boundsError; + if (getContext()->enableBoundsChecking) { + if (auto boundsError = bounds(hQueue, pSrc, 0, srcPitch * height); + boundsError != UR_RESULT_SUCCESS) { + return boundsError; + } } if (phEventWaitList != NULL && numEventsInWaitList > 0) { @@ -6479,11 +6550,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { auto pfnDeviceGlobalVariableWrite = getContext()->urDdiTable.Enqueue.pfnDeviceGlobalVariableWrite; @@ -6560,11 +6631,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. 
- ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { auto pfnDeviceGlobalVariableRead = getContext()->urDdiTable.Enqueue.pfnDeviceGlobalVariableRead; @@ -6647,9 +6718,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueReadHostPipe( ///< events that must be complete before the host pipe read. ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * - phEvent ///< [out][optional] returns an event object that identifies this read - ///< command + phEvent ///< [out][optional] returns an event object that identifies this read command ///< and can be used to query or queue a wait for this command to complete. + ///< If phEventWaitList and phEvent are not NULL, phEvent must not refer to + ///< an element of the phEventWaitList array. ) { auto pfnReadHostPipe = getContext()->urDdiTable.Enqueue.pfnReadHostPipe; @@ -6734,6 +6806,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueWriteHostPipe( ur_event_handle_t * phEvent ///< [out][optional] returns an event object that identifies this write command ///< and can be used to query or queue a wait for this command to complete. + ///< If phEventWaitList and phEvent are not NULL, phEvent must not refer to + ///< an element of the phEventWaitList array. ) { auto pfnWriteHostPipe = getContext()->urDdiTable.Enqueue.pfnWriteHostPipe; @@ -7216,7 +7290,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageCopyExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. 
+ ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnImageCopyExp = getContext()->urDdiTable.BindlessImagesExp.pfnImageCopyExp; @@ -7432,10 +7507,10 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( size_t size, ///< [in] size of the external memory ur_exp_external_mem_type_t memHandleType, ///< [in] type of external memory handle - ur_exp_interop_mem_desc_t - *pInteropMemDesc, ///< [in] the interop memory descriptor - ur_exp_interop_mem_handle_t - *phInteropMem ///< [out] interop memory handle to the external memory + ur_exp_external_mem_desc_t + *pExternalMemDesc, ///< [in] the external memory descriptor + ur_exp_external_mem_handle_t + *phExternalMem ///< [out] external memory handle to the external memory ) { auto pfnImportExternalMemoryExp = getContext()->urDdiTable.BindlessImagesExp.pfnImportExternalMemoryExp; @@ -7453,11 +7528,11 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } - if (NULL == pInteropMemDesc) { + if (NULL == pExternalMemDesc) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (NULL == phInteropMem) { + if (NULL == phExternalMem) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } @@ -7476,8 +7551,9 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( getContext()->refCountContext->logInvalidReference(hDevice); } - ur_result_t result = pfnImportExternalMemoryExp( - hContext, hDevice, size, memHandleType, pInteropMemDesc, phInteropMem); + ur_result_t result = + pfnImportExternalMemoryExp(hContext, hDevice, size, memHandleType, + pExternalMemDesc, phExternalMem); return result; } @@ -7490,8 +7566,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to 
image description - ur_exp_interop_mem_handle_t - hInteropMem, ///< [in] interop memory handle to the external memory + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory ur_exp_image_mem_native_handle_t * phImageMem ///< [out] image memory handle to the externally allocated memory ) { @@ -7511,7 +7587,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } - if (NULL == hInteropMem) { + if (NULL == hExternalMem) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } @@ -7543,23 +7619,27 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( } ur_result_t result = pfnMapExternalArrayExp( - hContext, hDevice, pImageFormat, pImageDesc, hInteropMem, phImageMem); + hContext, hDevice, pImageFormat, pImageDesc, hExternalMem, phImageMem); return result; } /////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urBindlessImagesReleaseInteropExp -__urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( +/// @brief Intercept function for urBindlessImagesMapExternalLinearMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( ur_context_handle_t hContext, ///< [in] handle of the context object ur_device_handle_t hDevice, ///< [in] handle of the device object - ur_exp_interop_mem_handle_t - hInteropMem ///< [in][release] handle of interop memory to be destroyed + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory ) { - auto pfnReleaseInteropExp = - getContext()->urDdiTable.BindlessImagesExp.pfnReleaseInteropExp; + auto pfnMapExternalLinearMemoryExp = + getContext() + 
->urDdiTable.BindlessImagesExp.pfnMapExternalLinearMemoryExp; - if (nullptr == pfnReleaseInteropExp) { + if (nullptr == pfnMapExternalLinearMemoryExp) { return UR_RESULT_ERROR_UNINITIALIZED; } @@ -7572,9 +7652,13 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } - if (NULL == hInteropMem) { + if (NULL == hExternalMem) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } + + if (NULL == ppRetMem) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } } if (getContext()->enableLifetimeValidation && @@ -7587,7 +7671,53 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( getContext()->refCountContext->logInvalidReference(hDevice); } - ur_result_t result = pfnReleaseInteropExp(hContext, hDevice, hInteropMem); + ur_result_t result = pfnMapExternalLinearMemoryExp( + hContext, hDevice, offset, size, hExternalMem, ppRetMem); + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urBindlessImagesReleaseExternalMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + ur_exp_external_mem_handle_t + hExternalMem ///< [in][release] handle of external memory to be destroyed +) { + auto pfnReleaseExternalMemoryExp = + getContext()->urDdiTable.BindlessImagesExp.pfnReleaseExternalMemoryExp; + + if (nullptr == pfnReleaseExternalMemoryExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == hContext) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == hDevice) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == hExternalMem) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + } + + if (getContext()->enableLifetimeValidation && + 
!getContext()->refCountContext->isReferenceValid(hContext)) { + getContext()->refCountContext->logInvalidReference(hContext); + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hDevice)) { + getContext()->refCountContext->logInvalidReference(hDevice); + } + + ur_result_t result = + pfnReleaseExternalMemoryExp(hContext, hDevice, hExternalMem); return result; } @@ -7599,10 +7729,10 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( ur_device_handle_t hDevice, ///< [in] handle of the device object ur_exp_external_semaphore_type_t semHandleType, ///< [in] type of external memory handle - ur_exp_interop_semaphore_desc_t - *pInteropSemaphoreDesc, ///< [in] the interop semaphore descriptor - ur_exp_interop_semaphore_handle_t * - phInteropSemaphore ///< [out] interop semaphore handle to the external semaphore + ur_exp_external_semaphore_desc_t + *pExternalSemaphoreDesc, ///< [in] the external semaphore descriptor + ur_exp_external_semaphore_handle_t * + phExternalSemaphore ///< [out] external semaphore handle to the external semaphore ) { auto pfnImportExternalSemaphoreExp = getContext() @@ -7621,11 +7751,11 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } - if (NULL == pInteropSemaphoreDesc) { + if (NULL == pExternalSemaphoreDesc) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (NULL == phInteropSemaphore) { + if (NULL == phExternalSemaphore) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } @@ -7646,8 +7776,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( } ur_result_t result = pfnImportExternalSemaphoreExp( - hContext, hDevice, semHandleType, pInteropSemaphoreDesc, - phInteropSemaphore); + hContext, hDevice, semHandleType, pExternalSemaphoreDesc, + phExternalSemaphore); return result; } @@ -7657,8 +7787,8 @@ __urdlllocal ur_result_t UR_APICALL 
urBindlessImagesImportExternalSemaphoreExp( __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( ur_context_handle_t hContext, ///< [in] handle of the context object ur_device_handle_t hDevice, ///< [in] handle of the device object - ur_exp_interop_semaphore_handle_t - hInteropSemaphore ///< [in][release] handle of interop semaphore to be destroyed + ur_exp_external_semaphore_handle_t + hExternalSemaphore ///< [in][release] handle of external semaphore to be destroyed ) { auto pfnReleaseExternalSemaphoreExp = getContext() @@ -7677,7 +7807,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } - if (NULL == hInteropSemaphore) { + if (NULL == hExternalSemaphore) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } } @@ -7693,7 +7823,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( } ur_result_t result = - pfnReleaseExternalSemaphoreExp(hContext, hDevice, hInteropSemaphore); + pfnReleaseExternalSemaphoreExp(hContext, hDevice, hExternalSemaphore); return result; } @@ -7702,8 +7832,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( /// @brief Intercept function for urBindlessImagesWaitExternalSemaphoreExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( ur_queue_handle_t hQueue, ///< [in] handle of the queue object - ur_exp_interop_semaphore_handle_t - hSemaphore, ///< [in] interop semaphore handle + ur_exp_external_semaphore_handle_t + hSemaphore, ///< [in] external semaphore handle bool hasWaitValue, ///< [in] indicates whether the samephore is capable and should wait on a ///< certain value. @@ -7719,7 +7849,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. 
If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnWaitExternalSemaphoreExp = getContext()->urDdiTable.BindlessImagesExp.pfnWaitExternalSemaphoreExp; @@ -7762,8 +7893,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( /// @brief Intercept function for urBindlessImagesSignalExternalSemaphoreExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( ur_queue_handle_t hQueue, ///< [in] handle of the queue object - ur_exp_interop_semaphore_handle_t - hSemaphore, ///< [in] interop semaphore handle + ur_exp_external_semaphore_handle_t + hSemaphore, ///< [in] external semaphore handle bool hasSignalValue, ///< [in] indicates whether the samephore is capable and should signal on a ///< certain value. @@ -7779,7 +7910,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { auto pfnSignalExternalSemaphoreExp = getContext() @@ -7948,16 +8080,37 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( const size_t * pGlobalWorkSize, ///< [in] Global work size to use when executing kernel. const size_t * - pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. + pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. If this + ///< parameter is nullptr, then a local work size will be generated by the + ///< implementation. + uint32_t + numKernelAlternatives, ///< [in] The number of kernel alternatives provided in + ///< phKernelAlternatives. 
+ ur_kernel_handle_t * + phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernel handles + ///< that might be used to update the kernel in this + ///< command after the command-buffer is finalized. The default kernel + ///< `hKernel` is implicitly marked as an alternative. It's + ///< invalid to specify it as part of this list. uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * pSyncPoint, ///< [out][optional] Sync point associated with this command. - ur_exp_command_buffer_command_handle_t - *phCommand ///< [out][optional] Handle to this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t * + phCommand ///< [out][optional] Handle to this command. Only available if the + ///< command-buffer is updatable. 
) { auto pfnAppendKernelLaunchExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendKernelLaunchExp; @@ -7983,6 +8136,14 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } + if (phKernelAlternatives == NULL && numKernelAlternatives > 0) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + + if (phKernelAlternatives != NULL && numKernelAlternatives == 0) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + if (pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0) { return UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP; } @@ -7990,6 +8151,22 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( if (pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0) { return UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP; } + + if (phEventWaitList == NULL && numEventsInWaitList > 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } } if (getContext()->enableLifetimeValidation && @@ -7999,8 +8176,9 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ur_result_t result = pfnAppendKernelLaunchExp( hCommandBuffer, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, - phCommand); + pLocalWorkSize, numKernelAlternatives, phKernelAlternatives, + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); return result; } @@ -8018,8 +8196,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( const ur_exp_command_buffer_sync_point_t * 
pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { auto pfnAppendUSMMemcpyExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendUSMMemcpyExp; @@ -8052,11 +8241,28 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( if (pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0) { return UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP; } + + if (phEventWaitList == NULL && numEventsInWaitList > 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } } - ur_result_t result = pfnAppendUSMMemcpyExp(hCommandBuffer, pDst, pSrc, size, - numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + ur_result_t result = pfnAppendUSMMemcpyExp( + hCommandBuffer, pDst, pSrc, size, numSyncPointsInWaitList, + 
pSyncPointWaitList, numEventsInWaitList, phEventWaitList, pSyncPoint, + phEvent, phCommand); return result; } @@ -8076,8 +8282,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) { auto pfnAppendUSMFillExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendUSMFillExp; @@ -8118,11 +8335,28 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( if (pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0) { return UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP; } + + if (phEventWaitList == NULL && numEventsInWaitList > 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } } ur_result_t result = pfnAppendUSMFillExp( hCommandBuffer, pMemory, pPattern, patternSize, size, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); return result; } @@ -8142,8 +8376,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. 
+ ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { auto pfnAppendMemBufferCopyExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendMemBufferCopyExp; @@ -8172,6 +8417,22 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( if (pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0) { return UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP; } + + if (phEventWaitList == NULL && numEventsInWaitList > 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } } if (getContext()->enableLifetimeValidation && @@ -8186,7 +8447,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( ur_result_t result = pfnAppendMemBufferCopyExp( hCommandBuffer, hSrcMem, hDstMem, srcOffset, dstOffset, size, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); return result; } @@ -8206,8 +8468,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. 
+ const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { auto pfnAppendMemBufferWriteExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendMemBufferWriteExp; @@ -8236,6 +8509,22 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( if (pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0) { return UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP; } + + if (phEventWaitList == NULL && numEventsInWaitList > 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } } if (getContext()->enableLifetimeValidation && @@ -8245,7 +8534,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( ur_result_t result = pfnAppendMemBufferWriteExp( hCommandBuffer, hBuffer, offset, size, pSrc, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + pSyncPointWaitList, numEventsInWaitList, phEventWaitList, pSyncPoint, + phEvent, phCommand); return result; } @@ -8264,8 +8554,19 @@ __urdlllocal ur_result_t 
UR_APICALL urCommandBufferAppendMemBufferReadExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) { auto pfnAppendMemBufferReadExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendMemBufferReadExp; @@ -8294,6 +8595,22 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( if (pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0) { return UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP; } + + if (phEventWaitList == NULL && numEventsInWaitList > 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } } if (getContext()->enableLifetimeValidation && @@ -8303,7 +8620,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( ur_result_t result = pfnAppendMemBufferReadExp( hCommandBuffer, hBuffer, offset, size, pDst, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + pSyncPointWaitList, numEventsInWaitList, phEventWaitList, pSyncPoint, + phEvent, phCommand); return result; } @@ -8330,8 +8648,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. 
+ pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { auto pfnAppendMemBufferCopyRectExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendMemBufferCopyRectExp; @@ -8360,6 +8689,22 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( if (pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0) { return UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP; } + + if (phEventWaitList == NULL && numEventsInWaitList > 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } } if (getContext()->enableLifetimeValidation && @@ -8375,7 +8720,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( ur_result_t result = pfnAppendMemBufferCopyRectExp( hCommandBuffer, hSrcMem, hDstMem, srcOrigin, dstOrigin, region, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); return result; } @@ -8408,8 +8754,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. 
+ uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { auto pfnAppendMemBufferWriteRectExp = getContext() @@ -8439,6 +8796,22 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( if (pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0) { return UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP; } + + if (phEventWaitList == NULL && numEventsInWaitList > 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } } if (getContext()->enableLifetimeValidation && @@ -8449,7 +8822,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( ur_result_t result = pfnAppendMemBufferWriteRectExp( hCommandBuffer, hBuffer, bufferOffset, hostOffset, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, 
pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); return result; } @@ -8480,8 +8854,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) { auto pfnAppendMemBufferReadRectExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendMemBufferReadRectExp; @@ -8510,6 +8895,22 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( if (pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0) { return UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP; } + + if (phEventWaitList == NULL && numEventsInWaitList > 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } } if (getContext()->enableLifetimeValidation && @@ -8520,7 +8921,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( ur_result_t result = pfnAppendMemBufferReadRectExp( hCommandBuffer, hBuffer, bufferOffset, hostOffset, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); return result; } @@ -8541,8 +8943,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. 
ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { auto pfnAppendMemBufferFillExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendMemBufferFillExp; @@ -8571,6 +8984,22 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( if (pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0) { return UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP; } + + if (phEventWaitList == NULL && numEventsInWaitList > 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } } if (getContext()->enableLifetimeValidation && @@ -8580,7 +9009,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( ur_result_t result = pfnAppendMemBufferFillExp( hCommandBuffer, hBuffer, pPattern, patternSize, offset, size, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); return result; } @@ -8598,8 +9028,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. 
May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { auto pfnAppendUSMPrefetchExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendUSMPrefetchExp; @@ -8632,11 +9073,28 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( if (size == 0) { return UR_RESULT_ERROR_INVALID_SIZE; } + + if (phEventWaitList == NULL && numEventsInWaitList > 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } } ur_result_t result = pfnAppendUSMPrefetchExp( hCommandBuffer, pMemory, size, flags, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + pSyncPointWaitList, numEventsInWaitList, phEventWaitList, pSyncPoint, + phEvent, phCommand); return result; } @@ -8654,8 +9112,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< 
[in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { auto pfnAppendUSMAdviseExp = getContext()->urDdiTable.CommandBufferExp.pfnAppendUSMAdviseExp; @@ -8688,11 +9157,28 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( if (size == 0) { return UR_RESULT_ERROR_INVALID_SIZE; } + + if (phEventWaitList == NULL && numEventsInWaitList > 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } } - ur_result_t result = pfnAppendUSMAdviseExp(hCommandBuffer, pMemory, size, - advice, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + ur_result_t result = pfnAppendUSMAdviseExp( + hCommandBuffer, pMemory, size, advice, numSyncPointsInWaitList, + pSyncPointWaitList, numEventsInWaitList, phEventWaitList, pSyncPoint, + phEvent, phCommand); return 
result; } @@ -8711,7 +9197,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferEnqueueExp( ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command-buffer execution instance. + ///< command-buffer execution instance. If phEventWaitList and phEvent are + ///< not NULL, phEvent must not refer to an element of the phEventWaitList array. ) { auto pfnEnqueueExp = getContext()->urDdiTable.CommandBufferExp.pfnEnqueueExp; @@ -8828,6 +9315,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( if (NULL == pUpdateKernelLaunch) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } + + if (pUpdateKernelLaunch->newWorkDim < 1 || + pUpdateKernelLaunch->newWorkDim > 3) { + return UR_RESULT_ERROR_INVALID_WORK_DIMENSION; + } } ur_result_t result = @@ -8836,6 +9328,81 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urCommandBufferUpdateSignalEventExp +__urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateSignalEventExp( + ur_exp_command_buffer_command_handle_t + hCommand, ///< [in] Handle of the command-buffer command to update. + ur_event_handle_t *phSignalEvent ///< [out] Event to be signaled. 
+) { + auto pfnUpdateSignalEventExp = + getContext()->urDdiTable.CommandBufferExp.pfnUpdateSignalEventExp; + + if (nullptr == pfnUpdateSignalEventExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == hCommand) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == phSignalEvent) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + } + + ur_result_t result = pfnUpdateSignalEventExp(hCommand, phSignalEvent); + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urCommandBufferUpdateWaitEventsExp +__urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateWaitEventsExp( + ur_exp_command_buffer_command_handle_t + hCommand, ///< [in] Handle of the command-buffer command to update. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. 
+) { + auto pfnUpdateWaitEventsExp = + getContext()->urDdiTable.CommandBufferExp.pfnUpdateWaitEventsExp; + + if (nullptr == pfnUpdateWaitEventsExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == hCommand) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (phEventWaitList == NULL && numEventsInWaitList > 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } + } + + ur_result_t result = + pfnUpdateWaitEventsExp(hCommand, numEventsInWaitList, phEventWaitList); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urCommandBufferGetInfoExp __urdlllocal ur_result_t UR_APICALL urCommandBufferGetInfoExp( @@ -8871,7 +9438,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferGetInfoExp( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT < propName) { + if (UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR < propName) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } @@ -8955,17 +9522,16 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that ///< specify the number of local work-items forming a work-group that will ///< execute the kernel function. - ///< If nullptr, the runtime implementation will choose the work-group - ///< size. + ///< If nullptr, the runtime implementation will choose the work-group size. 
uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { auto pfnCooperativeKernelLaunchExp = getContext()->urDdiTable.EnqueueExp.pfnCooperativeKernelLaunchExp; @@ -9078,8 +9644,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait - ///< events. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies this particular kernel ///< execution instance. Profiling information can be queried @@ -9087,7 +9652,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` ///< reports the timestamp at the time of the call to this function. ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` - ///< reports the timestamp recorded when the command is executed on the device. + ///< reports the timestamp recorded when the command is executed on the + ///< device. 
If phEventWaitList and phEvent are not NULL, phEvent must not + ///< refer to an element of the phEventWaitList array. ) { auto pfnTimestampRecordingExp = getContext()->urDdiTable.EnqueueExp.pfnTimestampRecordingExp; @@ -9161,7 +9728,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( ///< the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList + ///< array. ) { auto pfnKernelLaunchCustomExp = getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchCustomExp; @@ -9579,7 +10148,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies the work that has - ///< been enqueued in nativeEnqueueFunc. + ///< been enqueued in nativeEnqueueFunc. If phEventWaitList and phEvent are + ///< not NULL, phEvent must not refer to an element of the phEventWaitList array. 
) { auto pfnNativeCommandExp = getContext()->urDdiTable.EnqueueExp.pfnNativeCommandExp; @@ -9972,9 +10542,15 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( pDdiTable->pfnMapExternalArrayExp = ur_validation_layer::urBindlessImagesMapExternalArrayExp; - dditable.pfnReleaseInteropExp = pDdiTable->pfnReleaseInteropExp; - pDdiTable->pfnReleaseInteropExp = - ur_validation_layer::urBindlessImagesReleaseInteropExp; + dditable.pfnMapExternalLinearMemoryExp = + pDdiTable->pfnMapExternalLinearMemoryExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + ur_validation_layer::urBindlessImagesMapExternalLinearMemoryExp; + + dditable.pfnReleaseExternalMemoryExp = + pDdiTable->pfnReleaseExternalMemoryExp; + pDdiTable->pfnReleaseExternalMemoryExp = + ur_validation_layer::urBindlessImagesReleaseExternalMemoryExp; dditable.pfnImportExternalSemaphoreExp = pDdiTable->pfnImportExternalSemaphoreExp; @@ -10106,6 +10682,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( pDdiTable->pfnUpdateKernelLaunchExp = ur_validation_layer::urCommandBufferUpdateKernelLaunchExp; + dditable.pfnUpdateSignalEventExp = pDdiTable->pfnUpdateSignalEventExp; + pDdiTable->pfnUpdateSignalEventExp = + ur_validation_layer::urCommandBufferUpdateSignalEventExp; + + dditable.pfnUpdateWaitEventsExp = pDdiTable->pfnUpdateWaitEventsExp; + pDdiTable->pfnUpdateWaitEventsExp = + ur_validation_layer::urCommandBufferUpdateWaitEventsExp; + dditable.pfnGetInfoExp = pDdiTable->pfnGetInfoExp; pDdiTable->pfnGetInfoExp = ur_validation_layer::urCommandBufferGetInfoExp; @@ -11198,9 +11782,13 @@ ur_result_t context_t::init(ur_dditable_t *dditable, if (enabledLayerNames.count(nameFullValidation)) { enableParameterValidation = true; + enableBoundsChecking = true; enableLeakChecking = true; enableLifetimeValidation = true; } else { + if (enabledLayerNames.count(nameBoundsChecking)) { + enableBoundsChecking = true; + } if (enabledLayerNames.count(nameParameterValidation)) { 
enableParameterValidation = true; } @@ -11333,13 +11921,11 @@ ur_result_t context_t::init(ur_dditable_t *dditable, } ur_result_t context_t::tearDown() { - ur_result_t result = UR_RESULT_SUCCESS; - if (enableLeakChecking) { getContext()->refCountContext->logInvalidReferences(); - getContext()->refCountContext->clear(); } - return result; + + return UR_RESULT_SUCCESS; } } // namespace ur_validation_layer diff --git a/source/loader/layers/validation/ur_validation_layer.hpp b/source/loader/layers/validation/ur_validation_layer.hpp index aa3d4629b7..642829f7f0 100644 --- a/source/loader/layers/validation/ur_validation_layer.hpp +++ b/source/loader/layers/validation/ur_validation_layer.hpp @@ -24,6 +24,7 @@ class __urdlllocal context_t : public proxy_layer_context_t, public AtomicSingleton { public: bool enableParameterValidation = false; + bool enableBoundsChecking = false; bool enableLeakChecking = false; bool enableLifetimeValidation = false; logger::Logger logger; @@ -35,7 +36,7 @@ class __urdlllocal context_t : public proxy_layer_context_t, static std::vector getNames() { return {nameFullValidation, nameParameterValidation, nameLeakChecking, - nameLifetimeValidation}; + nameBoundsChecking, nameLifetimeValidation}; } ur_result_t init(ur_dditable_t *dditable, const std::set &enabledLayerNames, @@ -49,6 +50,8 @@ class __urdlllocal context_t : public proxy_layer_context_t, "UR_LAYER_FULL_VALIDATION"; inline static const std::string nameParameterValidation = "UR_LAYER_PARAMETER_VALIDATION"; + inline static const std::string nameBoundsChecking = + "UR_LAYER_BOUNDS_CHECKING"; inline static const std::string nameLeakChecking = "UR_LAYER_LEAK_CHECKING"; inline static const std::string nameLifetimeValidation = "UR_LAYER_LIFETIME_VALIDATION"; diff --git a/source/loader/loader.def.in b/source/loader/loader.def.in index e096e778ae..7ab3df9061 100644 --- a/source/loader/loader.def.in +++ b/source/loader/loader.def.in @@ -12,10 +12,11 @@ EXPORTS 
urBindlessImagesImportExternalMemoryExp urBindlessImagesImportExternalSemaphoreExp urBindlessImagesMapExternalArrayExp + urBindlessImagesMapExternalLinearMemoryExp urBindlessImagesMipmapFreeExp urBindlessImagesMipmapGetLevelExp + urBindlessImagesReleaseExternalMemoryExp urBindlessImagesReleaseExternalSemaphoreExp - urBindlessImagesReleaseInteropExp urBindlessImagesSampledImageCreateExp urBindlessImagesSampledImageHandleDestroyExp urBindlessImagesSignalExternalSemaphoreExp @@ -44,6 +45,8 @@ EXPORTS urCommandBufferRetainCommandExp urCommandBufferRetainExp urCommandBufferUpdateKernelLaunchExp + urCommandBufferUpdateSignalEventExp + urCommandBufferUpdateWaitEventsExp urContextCreate urContextCreateWithNativeHandle urContextGetInfo @@ -182,10 +185,11 @@ EXPORTS urPrintBindlessImagesImportExternalMemoryExpParams urPrintBindlessImagesImportExternalSemaphoreExpParams urPrintBindlessImagesMapExternalArrayExpParams + urPrintBindlessImagesMapExternalLinearMemoryExpParams urPrintBindlessImagesMipmapFreeExpParams urPrintBindlessImagesMipmapGetLevelExpParams + urPrintBindlessImagesReleaseExternalMemoryExpParams urPrintBindlessImagesReleaseExternalSemaphoreExpParams - urPrintBindlessImagesReleaseInteropExpParams urPrintBindlessImagesSampledImageCreateExpParams urPrintBindlessImagesSampledImageHandleDestroyExpParams urPrintBindlessImagesSignalExternalSemaphoreExpParams @@ -221,6 +225,8 @@ EXPORTS urPrintCommandBufferRetainCommandExpParams urPrintCommandBufferRetainExpParams urPrintCommandBufferUpdateKernelLaunchExpParams + urPrintCommandBufferUpdateSignalEventExpParams + urPrintCommandBufferUpdateWaitEventsExpParams urPrintContextCreateParams urPrintContextCreateWithNativeHandleParams urPrintContextFlags @@ -234,6 +240,7 @@ EXPORTS urPrintContextSetExtendedDeleterParams urPrintDeviceAffinityDomainFlags urPrintDeviceBinary + urPrintDeviceCommandBufferUpdateCapabilityFlags urPrintDeviceCreateWithNativeHandleParams urPrintDeviceExecCapabilityFlags urPrintDeviceFpCapabilityFlags @@ 
-306,13 +313,13 @@ EXPORTS urPrintExpCommandBufferUpdateValueArgDesc urPrintExpEnqueueNativeCommandFlags urPrintExpEnqueueNativeCommandProperties + urPrintExpExternalMemDesc urPrintExpExternalMemType + urPrintExpExternalSemaphoreDesc urPrintExpExternalSemaphoreType urPrintExpFileDescriptor urPrintExpImageCopyFlags urPrintExpImageCopyRegion - urPrintExpInteropMemDesc - urPrintExpInteropSemaphoreDesc urPrintExpLaunchProperty urPrintExpLaunchPropertyId urPrintExpPeerInfo diff --git a/source/loader/loader.map.in b/source/loader/loader.map.in index 6f55699cfe..a638186571 100644 --- a/source/loader/loader.map.in +++ b/source/loader/loader.map.in @@ -12,10 +12,11 @@ urBindlessImagesImportExternalMemoryExp; urBindlessImagesImportExternalSemaphoreExp; urBindlessImagesMapExternalArrayExp; + urBindlessImagesMapExternalLinearMemoryExp; urBindlessImagesMipmapFreeExp; urBindlessImagesMipmapGetLevelExp; + urBindlessImagesReleaseExternalMemoryExp; urBindlessImagesReleaseExternalSemaphoreExp; - urBindlessImagesReleaseInteropExp; urBindlessImagesSampledImageCreateExp; urBindlessImagesSampledImageHandleDestroyExp; urBindlessImagesSignalExternalSemaphoreExp; @@ -44,6 +45,8 @@ urCommandBufferRetainCommandExp; urCommandBufferRetainExp; urCommandBufferUpdateKernelLaunchExp; + urCommandBufferUpdateSignalEventExp; + urCommandBufferUpdateWaitEventsExp; urContextCreate; urContextCreateWithNativeHandle; urContextGetInfo; @@ -182,10 +185,11 @@ urPrintBindlessImagesImportExternalMemoryExpParams; urPrintBindlessImagesImportExternalSemaphoreExpParams; urPrintBindlessImagesMapExternalArrayExpParams; + urPrintBindlessImagesMapExternalLinearMemoryExpParams; urPrintBindlessImagesMipmapFreeExpParams; urPrintBindlessImagesMipmapGetLevelExpParams; + urPrintBindlessImagesReleaseExternalMemoryExpParams; urPrintBindlessImagesReleaseExternalSemaphoreExpParams; - urPrintBindlessImagesReleaseInteropExpParams; urPrintBindlessImagesSampledImageCreateExpParams; 
urPrintBindlessImagesSampledImageHandleDestroyExpParams; urPrintBindlessImagesSignalExternalSemaphoreExpParams; @@ -221,6 +225,8 @@ urPrintCommandBufferRetainCommandExpParams; urPrintCommandBufferRetainExpParams; urPrintCommandBufferUpdateKernelLaunchExpParams; + urPrintCommandBufferUpdateSignalEventExpParams; + urPrintCommandBufferUpdateWaitEventsExpParams; urPrintContextCreateParams; urPrintContextCreateWithNativeHandleParams; urPrintContextFlags; @@ -234,6 +240,7 @@ urPrintContextSetExtendedDeleterParams; urPrintDeviceAffinityDomainFlags; urPrintDeviceBinary; + urPrintDeviceCommandBufferUpdateCapabilityFlags; urPrintDeviceCreateWithNativeHandleParams; urPrintDeviceExecCapabilityFlags; urPrintDeviceFpCapabilityFlags; @@ -306,13 +313,13 @@ urPrintExpCommandBufferUpdateValueArgDesc; urPrintExpEnqueueNativeCommandFlags; urPrintExpEnqueueNativeCommandProperties; + urPrintExpExternalMemDesc; urPrintExpExternalMemType; + urPrintExpExternalSemaphoreDesc; urPrintExpExternalSemaphoreType; urPrintExpFileDescriptor; urPrintExpImageCopyFlags; urPrintExpImageCopyRegion; - urPrintExpInteropMemDesc; - urPrintExpInteropSemaphoreDesc; urPrintExpLaunchProperty; urPrintExpLaunchPropertyId; urPrintExpPeerInfo; diff --git a/source/loader/ur_adapter_registry.hpp b/source/loader/ur_adapter_registry.hpp index 25cd9a9fff..7df799ab1e 100644 --- a/source/loader/ur_adapter_registry.hpp +++ b/source/loader/ur_adapter_registry.hpp @@ -33,6 +33,14 @@ class AdapterRegistry { if (forceLoadedAdaptersOpt.has_value()) { for (const auto &s : forceLoadedAdaptersOpt.value()) { auto path = fs::path(s); + if (path.filename().extension() == STATIC_LIBRARY_EXTENSION) { + logger::warning( + "UR_ADAPTERS_FORCE_LOAD contains a path to a static" + "library {}, it will be skipped", + s); + continue; + } + bool exists = false; try { exists = fs::exists(path); @@ -41,11 +49,12 @@ class AdapterRegistry { } if (exists) { + forceLoaded = true; adaptersLoadPaths.emplace_back( std::vector{std::move(path)}); } else { 
logger::warning( - "Detected nonexistent path {} in environmental " + "Detected nonexistent path {} in environment " "variable UR_ADAPTERS_FORCE_LOAD", s); } @@ -92,6 +101,8 @@ class AdapterRegistry { size_t size() const noexcept { return adaptersLoadPaths.size(); } + bool adaptersForceLoaded() { return forceLoaded; } + std::vector>::const_iterator begin() const noexcept { return adaptersLoadPaths.begin(); } @@ -152,10 +163,123 @@ class AdapterRegistry { return paths.empty() ? std::nullopt : std::optional(paths); } + ur_result_t readPreFilterODS(std::string platformBackendName) { + // TODO: Refactor this to the common code such that both the prefilter and urDeviceGetSelected use the same functionality. + bool acceptLibrary = true; + std::optional odsEnvMap; + try { + odsEnvMap = getenv_to_map("ONEAPI_DEVICE_SELECTOR", false); + + } catch (...) { + // If the selector is malformed, then we ignore selector and return success. + logger::error("ERROR: missing backend, format of filter = " + "'[!]backend:filterStrings'"); + return UR_RESULT_SUCCESS; + } + logger::debug( + "getenv_to_map parsed env var and {} a map", + (odsEnvMap.has_value() ? "produced" : "failed to produce")); + + // if the ODS env var is not set at all, then pretend it was set to the default + using EnvVarMap = std::map>; + EnvVarMap mapODS = + odsEnvMap.has_value() ? odsEnvMap.value() : EnvVarMap{{"*", {"*"}}}; + for (auto &termPair : mapODS) { + std::string backend = termPair.first; + // TODO: Figure out how to process all ODS errors rather than returning + // on the first error. 
+ if (backend.empty()) { + // FIXME: never true because getenv_to_map rejects this case + // malformed term: missing backend -- output ERROR, then continue + logger::error("ERROR: missing backend, format of filter = " + "'[!]backend:filterStrings'"); + continue; + } + logger::debug("ONEAPI_DEVICE_SELECTOR Pre-Filter with backend '{}' " + "and platform library name '{}'", + backend, platformBackendName); + enum FilterType { + AcceptFilter, + DiscardFilter, + } termType = + (backend.front() != '!') ? AcceptFilter : DiscardFilter; + logger::debug( + "termType is {}", + (termType != AcceptFilter ? "DiscardFilter" : "AcceptFilter")); + if (termType != AcceptFilter) { + logger::debug("DEBUG: backend was '{}'", backend); + backend.erase(backend.cbegin()); + logger::debug("DEBUG: backend now '{}'", backend); + } + + // Verify that the backend string is valid, otherwise ignore the backend. + if ((strcmp(backend.c_str(), "*") != 0) && + (strcmp(backend.c_str(), "level_zero") != 0) && + (strcmp(backend.c_str(), "opencl") != 0) && + (strcmp(backend.c_str(), "cuda") != 0) && + (strcmp(backend.c_str(), "hip") != 0)) { + logger::debug("ONEAPI_DEVICE_SELECTOR Pre-Filter with illegal " + "backend '{}' ", + backend); + continue; + } + + // case-insensitive comparison by converting both tolower + std::transform(platformBackendName.begin(), + platformBackendName.end(), + platformBackendName.begin(), + [](unsigned char c) { return std::tolower(c); }); + std::transform(backend.begin(), backend.end(), backend.begin(), + [](unsigned char c) { return std::tolower(c); }); + std::size_t nameFound = platformBackendName.find(backend); + + bool backendFound = nameFound != std::string::npos; + if (termType == AcceptFilter) { + if (backend.front() != '*' && !backendFound) { + logger::debug( + "The ONEAPI_DEVICE_SELECTOR backend name '{}' was not " + "found in the platform library name '{}'", + backend, platformBackendName); + acceptLibrary = false; + continue; + } else if (backend.front() == '*' 
|| backendFound) { + return UR_RESULT_SUCCESS; + } + } else { + if (backendFound || backend.front() == '*') { + acceptLibrary = false; + logger::debug( + "The ONEAPI_DEVICE_SELECTOR backend name for discard " + "'{}' was found in the platform library name '{}'", + backend, platformBackendName); + continue; + } + } + } + if (acceptLibrary) { + return UR_RESULT_SUCCESS; + } + return UR_RESULT_ERROR_INVALID_VALUE; + } + void discoverKnownAdapters() { auto searchPathsEnvOpt = getEnvAdapterSearchPaths(); auto loaderLibPathOpt = getLoaderLibPath(); +#if defined(_WIN32) + bool loaderPreFilter = getenv_tobool("UR_LOADER_PRELOAD_FILTER", false); +#else + bool loaderPreFilter = getenv_tobool("UR_LOADER_PRELOAD_FILTER", true); +#endif for (const auto &adapterName : knownAdapterNames) { + + if (loaderPreFilter) { + if (readPreFilterODS(adapterName) != UR_RESULT_SUCCESS) { + logger::debug("The adapter '{}' was removed based on the " + "pre-filter from ONEAPI_DEVICE_SELECTOR.", + adapterName); + continue; + } + } std::vector loadPaths; // Adapter search order: @@ -183,6 +307,8 @@ class AdapterRegistry { } } + bool forceLoaded = false; + public: void enableMock() { adaptersLoadPaths.clear(); diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 1b87948b53..331649cb63 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -24,7 +24,7 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGet( ur_adapter_handle_t * phAdapters, ///< [out][optional][range(0, NumEntries)] array of handle of adapters. ///< If NumEntries is less than the number of adapters available, then - ///< ::urAdapterGet shall only retrieve that number of platforms. + ///< ::urAdapterGet shall only retrieve that number of adapters. uint32_t * pNumAdapters ///< [out][optional] returns the total number of adapters available. 
) { @@ -764,7 +764,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. ur_device_handle_t @@ -775,8 +776,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( [[maybe_unused]] auto context = getContext(); // extract platform's function pointer table - auto dditable = - reinterpret_cast(hPlatform)->dditable; + auto dditable = reinterpret_cast(hAdapter)->dditable; auto pfnCreateWithNativeHandle = dditable->ur.Device.pfnCreateWithNativeHandle; if (nullptr == pfnCreateWithNativeHandle) { @@ -784,10 +784,10 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( } // convert loader handle to platform handle - hPlatform = reinterpret_cast(hPlatform)->handle; + hAdapter = reinterpret_cast(hAdapter)->handle; // forward to device-platform - result = pfnCreateWithNativeHandle(hNativeDevice, hPlatform, pProperties, + result = pfnCreateWithNativeHandle(hNativeDevice, hAdapter, pProperties, phDevice); if (UR_RESULT_SUCCESS != result) { @@ -3138,6 +3138,7 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgValue( *pProperties, ///< [in][optional] pointer to value properties. const void *pArgValue ///< [in] argument value represented as matching arg type. + ///< The data pointed to will be copied and therefore can be reused on return. 
) { ur_result_t result = UR_RESULT_SUCCESS; @@ -4330,17 +4331,16 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that ///< specify the number of local work-items forming a work-group that will ///< execute the kernel function. - ///< If nullptr, the runtime implementation will choose the work-group - ///< size. + ///< If nullptr, the runtime implementation will choose the work-group size. uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -4405,7 +4405,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWait( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -4465,7 +4466,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. 
If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -4531,7 +4533,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferRead( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -4602,7 +4605,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWrite( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -4683,7 +4687,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferReadRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -4768,7 +4773,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { ur_result_t result = UR_RESULT_SUCCESS; @@ -4839,7 +4845,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -4921,7 +4928,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -4994,7 +5002,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferFill( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -5068,7 +5077,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageRead( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -5143,7 +5153,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageWrite( ///< command does not wait on any event to complete. 
ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -5219,7 +5230,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageCopy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -5291,7 +5303,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferMap( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent, ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. void **ppRetMap ///< [out] return mapped pointer. TODO: move it before ///< numEventsInWaitList? ) { @@ -5359,7 +5372,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemUnmap( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -5429,7 +5443,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. 
+ ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -5495,7 +5510,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -5559,7 +5575,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMPrefetch( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -5675,11 +5692,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill2D( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. 
) { ur_result_t result = UR_RESULT_SUCCESS; @@ -5747,11 +5764,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -5816,11 +5833,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -5889,11 +5906,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. 
+ ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -5968,9 +5985,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueReadHostPipe( ///< events that must be complete before the host pipe read. ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * - phEvent ///< [out][optional] returns an event object that identifies this read - ///< command + phEvent ///< [out][optional] returns an event object that identifies this read command ///< and can be used to query or queue a wait for this command to complete. + ///< If phEventWaitList and phEvent are not NULL, phEvent must not refer to + ///< an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -6047,6 +6065,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueWriteHostPipe( ur_event_handle_t * phEvent ///< [out][optional] returns an event object that identifies this write command ///< and can be used to query or queue a wait for this command to complete. + ///< If phEventWaitList and phEvent are not NULL, phEvent must not refer to + ///< an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -6394,7 +6414,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageCopyExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { ur_result_t result = UR_RESULT_SUCCESS; @@ -6552,10 +6573,10 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( size_t size, ///< [in] size of the external memory ur_exp_external_mem_type_t memHandleType, ///< [in] type of external memory handle - ur_exp_interop_mem_desc_t - *pInteropMemDesc, ///< [in] the interop memory descriptor - ur_exp_interop_mem_handle_t - *phInteropMem ///< [out] interop memory handle to the external memory + ur_exp_external_mem_desc_t + *pExternalMemDesc, ///< [in] the external memory descriptor + ur_exp_external_mem_handle_t + *phExternalMem ///< [out] external memory handle to the external memory ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -6577,7 +6598,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( // forward to device-platform result = pfnImportExternalMemoryExp(hContext, hDevice, size, memHandleType, - pInteropMemDesc, phInteropMem); + pExternalMemDesc, phExternalMem); if (UR_RESULT_SUCCESS != result) { return result; @@ -6585,9 +6606,9 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( try { // convert platform handle to loader handle - *phInteropMem = reinterpret_cast( - context->factories.ur_exp_interop_mem_factory.getInstance( - *phInteropMem, dditable)); + *phExternalMem = reinterpret_cast( + context->factories.ur_exp_external_mem_factory.getInstance( + *phExternalMem, dditable)); } catch (std::bad_alloc &) { result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } @@ -6603,8 +6624,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description - ur_exp_interop_mem_handle_t - hInteropMem, ///< [in] interop memory handle to the external memory + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory 
ur_exp_image_mem_native_handle_t * phImageMem ///< [out] image memory handle to the externally allocated memory ) { @@ -6627,12 +6648,12 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( hDevice = reinterpret_cast(hDevice)->handle; // convert loader handle to platform handle - hInteropMem = - reinterpret_cast(hInteropMem)->handle; + hExternalMem = + reinterpret_cast(hExternalMem)->handle; // forward to device-platform result = pfnMapExternalArrayExp(hContext, hDevice, pImageFormat, pImageDesc, - hInteropMem, phImageMem); + hExternalMem, phImageMem); if (UR_RESULT_SUCCESS != result) { return result; @@ -6642,12 +6663,52 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urBindlessImagesReleaseInteropExp -__urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( +/// @brief Intercept function for urBindlessImagesMapExternalLinearMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory +) { + ur_result_t result = UR_RESULT_SUCCESS; + + [[maybe_unused]] auto context = getContext(); + + // extract platform's function pointer table + auto dditable = reinterpret_cast(hContext)->dditable; + auto pfnMapExternalLinearMemoryExp = + dditable->ur.BindlessImagesExp.pfnMapExternalLinearMemoryExp; + if (nullptr == pfnMapExternalLinearMemoryExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + // convert loader handle to platform handle + 
hContext = reinterpret_cast(hContext)->handle; + + // convert loader handle to platform handle + hDevice = reinterpret_cast(hDevice)->handle; + + // convert loader handle to platform handle + hExternalMem = + reinterpret_cast(hExternalMem)->handle; + + // forward to device-platform + result = pfnMapExternalLinearMemoryExp(hContext, hDevice, offset, size, + hExternalMem, ppRetMem); + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urBindlessImagesReleaseExternalMemoryExp +__urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( ur_context_handle_t hContext, ///< [in] handle of the context object ur_device_handle_t hDevice, ///< [in] handle of the device object - ur_exp_interop_mem_handle_t - hInteropMem ///< [in][release] handle of interop memory to be destroyed + ur_exp_external_mem_handle_t + hExternalMem ///< [in][release] handle of external memory to be destroyed ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -6655,9 +6716,9 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( // extract platform's function pointer table auto dditable = reinterpret_cast(hContext)->dditable; - auto pfnReleaseInteropExp = - dditable->ur.BindlessImagesExp.pfnReleaseInteropExp; - if (nullptr == pfnReleaseInteropExp) { + auto pfnReleaseExternalMemoryExp = + dditable->ur.BindlessImagesExp.pfnReleaseExternalMemoryExp; + if (nullptr == pfnReleaseExternalMemoryExp) { return UR_RESULT_ERROR_UNINITIALIZED; } @@ -6668,11 +6729,11 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( hDevice = reinterpret_cast(hDevice)->handle; // convert loader handle to platform handle - hInteropMem = - reinterpret_cast(hInteropMem)->handle; + hExternalMem = + reinterpret_cast(hExternalMem)->handle; // forward to device-platform - result = pfnReleaseInteropExp(hContext, hDevice, hInteropMem); + result = pfnReleaseExternalMemoryExp(hContext, hDevice, 
hExternalMem); return result; } @@ -6684,10 +6745,10 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( ur_device_handle_t hDevice, ///< [in] handle of the device object ur_exp_external_semaphore_type_t semHandleType, ///< [in] type of external memory handle - ur_exp_interop_semaphore_desc_t - *pInteropSemaphoreDesc, ///< [in] the interop semaphore descriptor - ur_exp_interop_semaphore_handle_t * - phInteropSemaphore ///< [out] interop semaphore handle to the external semaphore + ur_exp_external_semaphore_desc_t + *pExternalSemaphoreDesc, ///< [in] the external semaphore descriptor + ur_exp_external_semaphore_handle_t * + phExternalSemaphore ///< [out] external semaphore handle to the external semaphore ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -6709,8 +6770,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( // forward to device-platform result = pfnImportExternalSemaphoreExp(hContext, hDevice, semHandleType, - pInteropSemaphoreDesc, - phInteropSemaphore); + pExternalSemaphoreDesc, + phExternalSemaphore); if (UR_RESULT_SUCCESS != result) { return result; @@ -6718,10 +6779,10 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( try { // convert platform handle to loader handle - *phInteropSemaphore = - reinterpret_cast( - context->factories.ur_exp_interop_semaphore_factory.getInstance( - *phInteropSemaphore, dditable)); + *phExternalSemaphore = + reinterpret_cast( + context->factories.ur_exp_external_semaphore_factory + .getInstance(*phExternalSemaphore, dditable)); } catch (std::bad_alloc &) { result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } @@ -6734,8 +6795,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( ur_context_handle_t hContext, ///< [in] handle of the context object ur_device_handle_t hDevice, ///< [in] handle of the device object - 
ur_exp_interop_semaphore_handle_t - hInteropSemaphore ///< [in][release] handle of interop semaphore to be destroyed + ur_exp_external_semaphore_handle_t + hExternalSemaphore ///< [in][release] handle of external semaphore to be destroyed ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -6756,13 +6817,13 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( hDevice = reinterpret_cast(hDevice)->handle; // convert loader handle to platform handle - hInteropSemaphore = - reinterpret_cast(hInteropSemaphore) - ->handle; + hExternalSemaphore = reinterpret_cast( + hExternalSemaphore) + ->handle; // forward to device-platform result = - pfnReleaseExternalSemaphoreExp(hContext, hDevice, hInteropSemaphore); + pfnReleaseExternalSemaphoreExp(hContext, hDevice, hExternalSemaphore); return result; } @@ -6771,8 +6832,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( /// @brief Intercept function for urBindlessImagesWaitExternalSemaphoreExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( ur_queue_handle_t hQueue, ///< [in] handle of the queue object - ur_exp_interop_semaphore_handle_t - hSemaphore, ///< [in] interop semaphore handle + ur_exp_external_semaphore_handle_t + hSemaphore, ///< [in] external semaphore handle bool hasWaitValue, ///< [in] indicates whether the samephore is capable and should wait on a ///< certain value. @@ -6788,7 +6849,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { ur_result_t result = UR_RESULT_SUCCESS; @@ -6807,7 +6869,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( // convert loader handle to platform handle hSemaphore = - reinterpret_cast(hSemaphore) + reinterpret_cast(hSemaphore) ->handle; // convert loader handles to platform handles @@ -6845,8 +6907,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( /// @brief Intercept function for urBindlessImagesSignalExternalSemaphoreExp __urdlllocal ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( ur_queue_handle_t hQueue, ///< [in] handle of the queue object - ur_exp_interop_semaphore_handle_t - hSemaphore, ///< [in] interop semaphore handle + ur_exp_external_semaphore_handle_t + hSemaphore, ///< [in] external semaphore handle bool hasSignalValue, ///< [in] indicates whether the samephore is capable and should signal on a ///< certain value. @@ -6862,7 +6924,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -6881,7 +6944,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( // convert loader handle to platform handle hSemaphore = - reinterpret_cast(hSemaphore) + reinterpret_cast(hSemaphore) ->handle; // convert loader handles to platform handles @@ -7064,16 +7127,37 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( const size_t * pGlobalWorkSize, ///< [in] Global work size to use when executing kernel. const size_t * - pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. 
+ pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. If this + ///< parameter is nullptr, then a local work size will be generated by the + ///< implementation. + uint32_t + numKernelAlternatives, ///< [in] The number of kernel alternatives provided in + ///< phKernelAlternatives. + ur_kernel_handle_t * + phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernel handles + ///< that might be used to update the kernel in this + ///< command after the command-buffer is finalized. The default kernel + ///< `hKernel` is implicitly marked as an alternative. It's + ///< invalid to specify it as part of this list. uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * pSyncPoint, ///< [out][optional] Sync point associated with this command. - ur_exp_command_buffer_command_handle_t - *phCommand ///< [out][optional] Handle to this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t * + phCommand ///< [out][optional] Handle to this command. Only available if the + ///< command-buffer is updatable. 
) { ur_result_t result = UR_RESULT_SUCCESS; @@ -7097,16 +7181,45 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( // convert loader handle to platform handle hKernel = reinterpret_cast(hKernel)->handle; + // convert loader handles to platform handles + auto phKernelAlternativesLocal = + std::vector(numKernelAlternatives); + for (size_t i = 0; i < numKernelAlternatives; ++i) { + phKernelAlternativesLocal[i] = + reinterpret_cast(phKernelAlternatives[i]) + ->handle; + } + + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + // forward to device-platform result = pfnAppendKernelLaunchExp( hCommandBuffer, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, - phCommand); + pLocalWorkSize, numKernelAlternatives, phKernelAlternativesLocal.data(), + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitListLocal.data(), pSyncPoint, phEvent, phCommand); if (UR_RESULT_SUCCESS != result) { return result; } + try { + // convert platform handle to loader handle + if (nullptr != phEvent) { + *phEvent = reinterpret_cast( + context->factories.ur_event_factory.getInstance(*phEvent, + dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + try { // convert platform handle to loader handle if (nullptr != phCommand) { @@ -7135,8 +7248,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. 
+ const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -7157,10 +7281,46 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( reinterpret_cast(hCommandBuffer) ->handle; + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + // forward to device-platform - result = pfnAppendUSMMemcpyExp(hCommandBuffer, pDst, pSrc, size, - numSyncPointsInWaitList, pSyncPointWaitList, - pSyncPoint); + result = pfnAppendUSMMemcpyExp( + hCommandBuffer, pDst, pSrc, size, numSyncPointsInWaitList, + pSyncPointWaitList, numEventsInWaitList, phEventWaitListLocal.data(), + pSyncPoint, phEvent, phCommand); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + if (nullptr != phEvent) { + *phEvent = reinterpret_cast( + context->factories.ur_event_factory.getInstance(*phEvent, + dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + try { + // convert platform handle to loader handle + if (nullptr != phCommand) { + *phCommand = + reinterpret_cast( + 
context->factories.ur_exp_command_buffer_command_factory + .getInstance(*phCommand, dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } return result; } @@ -7180,8 +7340,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) { ur_result_t result = UR_RESULT_SUCCESS; @@ -7202,10 +7373,46 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( reinterpret_cast(hCommandBuffer) ->handle; + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + // forward to device-platform - result = pfnAppendUSMFillExp(hCommandBuffer, pMemory, pPattern, patternSize, - size, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + result = pfnAppendUSMFillExp( + hCommandBuffer, pMemory, pPattern, patternSize, size, + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitListLocal.data(), pSyncPoint, phEvent, phCommand); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + if (nullptr != phEvent) { + *phEvent = reinterpret_cast( + context->factories.ur_event_factory.getInstance(*phEvent, + dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + try { + // convert platform handle to loader handle + if (nullptr != phCommand) { + *phCommand = + reinterpret_cast( + context->factories.ur_exp_command_buffer_command_factory + .getInstance(*phCommand, dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } return result; } @@ -7225,8 +7432,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. 
+ const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -7253,10 +7471,46 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( // convert loader handle to platform handle hDstMem = reinterpret_cast(hDstMem)->handle; + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + // forward to device-platform result = pfnAppendMemBufferCopyExp( hCommandBuffer, hSrcMem, hDstMem, srcOffset, dstOffset, size, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitListLocal.data(), pSyncPoint, phEvent, phCommand); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + if (nullptr != phEvent) { + *phEvent = reinterpret_cast( + context->factories.ur_event_factory.getInstance(*phEvent, + dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + try { + // convert platform handle to loader handle + if (nullptr != phCommand) { + *phCommand = + 
reinterpret_cast( + context->factories.ur_exp_command_buffer_command_factory + .getInstance(*phCommand, dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } return result; } @@ -7276,8 +7530,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) { ur_result_t result = UR_RESULT_SUCCESS; @@ -7301,10 +7566,46 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( // convert loader handle to platform handle hBuffer = reinterpret_cast(hBuffer)->handle; + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + // forward to device-platform - result = pfnAppendMemBufferWriteExp(hCommandBuffer, hBuffer, offset, size, - pSrc, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + result = pfnAppendMemBufferWriteExp( + hCommandBuffer, hBuffer, offset, size, pSrc, numSyncPointsInWaitList, + pSyncPointWaitList, numEventsInWaitList, phEventWaitListLocal.data(), + pSyncPoint, phEvent, phCommand); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + if (nullptr != phEvent) { + *phEvent = reinterpret_cast( + context->factories.ur_event_factory.getInstance(*phEvent, + dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + try { + // convert platform handle to loader handle + if (nullptr != phCommand) { + *phCommand = + reinterpret_cast( + context->factories.ur_exp_command_buffer_command_factory + .getInstance(*phCommand, dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } return result; } @@ -7323,9 +7624,20 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. - ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. 
-) { + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. + ur_exp_command_buffer_sync_point_t * + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. +) { ur_result_t result = UR_RESULT_SUCCESS; [[maybe_unused]] auto context = getContext(); @@ -7348,10 +7660,46 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( // convert loader handle to platform handle hBuffer = reinterpret_cast(hBuffer)->handle; + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + // forward to device-platform - result = pfnAppendMemBufferReadExp(hCommandBuffer, hBuffer, offset, size, - pDst, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + result = pfnAppendMemBufferReadExp( + hCommandBuffer, hBuffer, offset, size, pDst, numSyncPointsInWaitList, + pSyncPointWaitList, numEventsInWaitList, phEventWaitListLocal.data(), + pSyncPoint, phEvent, phCommand); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + if (nullptr != phEvent) { + *phEvent = reinterpret_cast( + context->factories.ur_event_factory.getInstance(*phEvent, + dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + 
} + + try { + // convert platform handle to loader handle + if (nullptr != phCommand) { + *phCommand = + reinterpret_cast( + context->factories.ur_exp_command_buffer_command_factory + .getInstance(*phCommand, dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } return result; } @@ -7378,8 +7726,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) { ur_result_t result = UR_RESULT_SUCCESS; @@ -7406,11 +7765,47 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( // convert loader handle to platform handle hDstMem = reinterpret_cast(hDstMem)->handle; + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + // forward to device-platform result = pfnAppendMemBufferCopyRectExp( hCommandBuffer, hSrcMem, hDstMem, srcOrigin, dstOrigin, region, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitListLocal.data(), pSyncPoint, phEvent, phCommand); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + if (nullptr != phEvent) { + *phEvent = reinterpret_cast( + context->factories.ur_event_factory.getInstance(*phEvent, + dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + try { + // convert platform handle to loader handle + if (nullptr != phCommand) { + *phCommand = + reinterpret_cast( + context->factories.ur_exp_command_buffer_command_factory + .getInstance(*phCommand, dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } return result; } @@ -7443,8 +7838,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. 
+ const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -7468,11 +7874,47 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( // convert loader handle to platform handle hBuffer = reinterpret_cast(hBuffer)->handle; + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + // forward to device-platform result = pfnAppendMemBufferWriteRectExp( hCommandBuffer, hBuffer, bufferOffset, hostOffset, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitListLocal.data(), pSyncPoint, phEvent, phCommand); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + if (nullptr != phEvent) { + *phEvent = reinterpret_cast( + context->factories.ur_event_factory.getInstance(*phEvent, + dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + try { + // convert platform handle to 
loader handle + if (nullptr != phCommand) { + *phCommand = + reinterpret_cast( + context->factories.ur_exp_command_buffer_command_factory + .getInstance(*phCommand, dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } return result; } @@ -7503,8 +7945,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) { ur_result_t result = UR_RESULT_SUCCESS; @@ -7528,11 +7981,47 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( // convert loader handle to platform handle hBuffer = reinterpret_cast(hBuffer)->handle; + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + // forward to device-platform result = pfnAppendMemBufferReadRectExp( hCommandBuffer, hBuffer, bufferOffset, hostOffset, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitListLocal.data(), pSyncPoint, phEvent, phCommand); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + if (nullptr != phEvent) { + *phEvent = reinterpret_cast( + context->factories.ur_event_factory.getInstance(*phEvent, + dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + try { + // convert platform handle to loader handle + if (nullptr != phCommand) { + *phCommand = + reinterpret_cast( + context->factories.ur_exp_command_buffer_command_factory + .getInstance(*phCommand, dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } return result; } @@ -7553,8 +8042,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. 
+ const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -7578,10 +8078,46 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( // convert loader handle to platform handle hBuffer = reinterpret_cast(hBuffer)->handle; + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + // forward to device-platform result = pfnAppendMemBufferFillExp( hCommandBuffer, hBuffer, pPattern, patternSize, offset, size, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitListLocal.data(), pSyncPoint, phEvent, phCommand); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + if (nullptr != phEvent) { + *phEvent = reinterpret_cast( + context->factories.ur_event_factory.getInstance(*phEvent, + dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + try { + // convert platform handle to loader handle + if (nullptr != phCommand) { + *phCommand = + 
reinterpret_cast( + context->factories.ur_exp_command_buffer_command_factory + .getInstance(*phCommand, dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } return result; } @@ -7599,8 +8135,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) { ur_result_t result = UR_RESULT_SUCCESS; @@ -7621,10 +8168,46 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( reinterpret_cast(hCommandBuffer) ->handle; + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + // forward to device-platform - result = pfnAppendUSMPrefetchExp(hCommandBuffer, pMemory, size, flags, - numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + result = pfnAppendUSMPrefetchExp( + hCommandBuffer, pMemory, size, flags, numSyncPointsInWaitList, + pSyncPointWaitList, numEventsInWaitList, phEventWaitListLocal.data(), + pSyncPoint, phEvent, phCommand); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + if (nullptr != phEvent) { + *phEvent = reinterpret_cast( + context->factories.ur_event_factory.getInstance(*phEvent, + dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + try { + // convert platform handle to loader handle + if (nullptr != phCommand) { + *phCommand = + reinterpret_cast( + context->factories.ur_exp_command_buffer_command_factory + .getInstance(*phCommand, dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } return result; } @@ -7642,8 +8225,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. 
If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -7664,10 +8258,46 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( reinterpret_cast(hCommandBuffer) ->handle; + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + // forward to device-platform - result = pfnAppendUSMAdviseExp(hCommandBuffer, pMemory, size, advice, - numSyncPointsInWaitList, pSyncPointWaitList, - pSyncPoint); + result = pfnAppendUSMAdviseExp( + hCommandBuffer, pMemory, size, advice, numSyncPointsInWaitList, + pSyncPointWaitList, numEventsInWaitList, phEventWaitListLocal.data(), + pSyncPoint, phEvent, phCommand); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + if (nullptr != phEvent) { + *phEvent = reinterpret_cast( + context->factories.ur_event_factory.getInstance(*phEvent, + dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + try { + // convert platform handle to loader handle + if (nullptr != phCommand) { + *phCommand = + reinterpret_cast( + context->factories.ur_exp_command_buffer_command_factory + .getInstance(*phCommand, dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + 
} return result; } @@ -7686,7 +8316,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferEnqueueExp( ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command-buffer execution instance. + ///< command-buffer execution instance. If phEventWaitList and phEvent are + ///< not NULL, phEvent must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -7831,6 +8462,13 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( // Deal with any struct parameters that have handle members we need to convert. auto pUpdateKernelLaunchLocal = *pUpdateKernelLaunch; + if (pUpdateKernelLaunchLocal.hNewKernel) { + pUpdateKernelLaunchLocal.hNewKernel = + reinterpret_cast( + pUpdateKernelLaunchLocal.hNewKernel) + ->handle; + } + std::vector pUpdateKernelLaunchpNewMemObjArgList; for (uint32_t i = 0; i < pUpdateKernelLaunch->numNewMemObjArgs; i++) { @@ -7856,6 +8494,96 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urCommandBufferUpdateSignalEventExp +__urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateSignalEventExp( + ur_exp_command_buffer_command_handle_t + hCommand, ///< [in] Handle of the command-buffer command to update. + ur_event_handle_t *phSignalEvent ///< [out] Event to be signaled. 
+) { + ur_result_t result = UR_RESULT_SUCCESS; + + [[maybe_unused]] auto context = getContext(); + + // extract platform's function pointer table + auto dditable = + reinterpret_cast(hCommand) + ->dditable; + auto pfnUpdateSignalEventExp = + dditable->ur.CommandBufferExp.pfnUpdateSignalEventExp; + if (nullptr == pfnUpdateSignalEventExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + // convert loader handle to platform handle + hCommand = + reinterpret_cast(hCommand) + ->handle; + + // forward to device-platform + result = pfnUpdateSignalEventExp(hCommand, phSignalEvent); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + *phSignalEvent = reinterpret_cast( + context->factories.ur_event_factory.getInstance(*phSignalEvent, + dditable)); + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urCommandBufferUpdateWaitEventsExp +__urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateWaitEventsExp( + ur_exp_command_buffer_command_handle_t + hCommand, ///< [in] Handle of the command-buffer command to update. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. 
+) { + ur_result_t result = UR_RESULT_SUCCESS; + + [[maybe_unused]] auto context = getContext(); + + // extract platform's function pointer table + auto dditable = + reinterpret_cast(hCommand) + ->dditable; + auto pfnUpdateWaitEventsExp = + dditable->ur.CommandBufferExp.pfnUpdateWaitEventsExp; + if (nullptr == pfnUpdateWaitEventsExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + // convert loader handle to platform handle + hCommand = + reinterpret_cast(hCommand) + ->handle; + + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + + // forward to device-platform + result = pfnUpdateWaitEventsExp(hCommand, numEventsInWaitList, + phEventWaitListLocal.data()); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urCommandBufferGetInfoExp __urdlllocal ur_result_t UR_APICALL urCommandBufferGetInfoExp( @@ -7956,17 +8684,16 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that ///< specify the number of local work-items forming a work-group that will ///< execute the kernel function. - ///< If nullptr, the runtime implementation will choose the work-group - ///< size. + ///< If nullptr, the runtime implementation will choose the work-group size. uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. 
ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -8065,8 +8792,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait - ///< events. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies this particular kernel ///< execution instance. Profiling information can be queried @@ -8074,7 +8800,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` ///< reports the timestamp at the time of the call to this function. ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` - ///< reports the timestamp recorded when the command is executed on the device. + ///< reports the timestamp recorded when the command is executed on the + ///< device. If phEventWaitList and phEvent are not NULL, phEvent must not + ///< refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -8148,7 +8876,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( ///< the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. 
If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList + ///< array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -8503,7 +9233,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies the work that has - ///< been enqueued in nativeEnqueueFunc. + ///< been enqueued in nativeEnqueueFunc. If phEventWaitList and phEvent are + ///< not NULL, phEvent must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; @@ -8733,6 +9464,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -8791,6 +9527,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -8834,8 +9575,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( ur_loader::urBindlessImagesImportExternalMemoryExp; pDdiTable->pfnMapExternalArrayExp = ur_loader::urBindlessImagesMapExternalArrayExp; - pDdiTable->pfnReleaseInteropExp = - ur_loader::urBindlessImagesReleaseInteropExp; + pDdiTable->pfnMapExternalLinearMemoryExp = + ur_loader::urBindlessImagesMapExternalLinearMemoryExp; + pDdiTable->pfnReleaseExternalMemoryExp = + ur_loader::urBindlessImagesReleaseExternalMemoryExp; pDdiTable->pfnImportExternalSemaphoreExp = 
ur_loader::urBindlessImagesImportExternalSemaphoreExp; pDdiTable->pfnReleaseExternalSemaphoreExp = @@ -8881,6 +9624,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -8935,6 +9683,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( ur_loader::urCommandBufferReleaseCommandExp; pDdiTable->pfnUpdateKernelLaunchExp = ur_loader::urCommandBufferUpdateKernelLaunchExp; + pDdiTable->pfnUpdateSignalEventExp = + ur_loader::urCommandBufferUpdateSignalEventExp; + pDdiTable->pfnUpdateWaitEventsExp = + ur_loader::urCommandBufferUpdateWaitEventsExp; pDdiTable->pfnGetInfoExp = ur_loader::urCommandBufferGetInfoExp; pDdiTable->pfnCommandGetInfoExp = ur_loader::urCommandBufferCommandGetInfoExp; @@ -8975,6 +9727,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetContextProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9036,6 +9793,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9119,6 +9881,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } 
+ if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9181,6 +9948,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9242,6 +10014,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9313,6 +10090,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9369,6 +10151,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetMemProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9433,6 +10220,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9490,6 +10282,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( // Load the device-platform DDI tables for (auto &platform : 
ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9551,6 +10348,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9622,6 +10424,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramExpProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9679,6 +10486,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9740,6 +10552,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9799,6 +10616,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetTensorMapExpProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9856,6 +10678,11 @@ 
UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9917,6 +10744,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -9972,6 +10804,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUsmP2PExpProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -10032,6 +10869,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetVirtualMemProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } @@ -10094,6 +10936,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( // Load the device-platform DDI tables for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) { + continue; + } + if (platform.initStatus != UR_RESULT_SUCCESS) { continue; } diff --git a/source/loader/ur_ldrddi.hpp b/source/loader/ur_ldrddi.hpp index 464eec90a7..309fb6cc65 100644 --- a/source/loader/ur_ldrddi.hpp +++ b/source/loader/ur_ldrddi.hpp @@ -65,16 +65,16 @@ using ur_usm_pool_object_t = object_t; using ur_usm_pool_factory_t 
= singleton_factory_t; -using ur_exp_interop_mem_object_t = object_t; -using ur_exp_interop_mem_factory_t = - singleton_factory_t; +using ur_exp_external_mem_object_t = object_t; +using ur_exp_external_mem_factory_t = + singleton_factory_t; -using ur_exp_interop_semaphore_object_t = - object_t; -using ur_exp_interop_semaphore_factory_t = - singleton_factory_t; +using ur_exp_external_semaphore_object_t = + object_t; +using ur_exp_external_semaphore_factory_t = + singleton_factory_t; using ur_exp_command_buffer_object_t = object_t; using ur_exp_command_buffer_factory_t = @@ -104,8 +104,8 @@ struct handle_factories { ur_mem_factory_t ur_mem_factory; ur_physical_mem_factory_t ur_physical_mem_factory; ur_usm_pool_factory_t ur_usm_pool_factory; - ur_exp_interop_mem_factory_t ur_exp_interop_mem_factory; - ur_exp_interop_semaphore_factory_t ur_exp_interop_semaphore_factory; + ur_exp_external_mem_factory_t ur_exp_external_mem_factory; + ur_exp_external_semaphore_factory_t ur_exp_external_semaphore_factory; ur_exp_command_buffer_factory_t ur_exp_command_buffer_factory; ur_exp_command_buffer_command_factory_t ur_exp_command_buffer_command_factory; diff --git a/source/loader/ur_lib.cpp b/source/loader/ur_lib.cpp index 2e04a2962d..e1de6d6237 100644 --- a/source/loader/ur_lib.cpp +++ b/source/loader/ur_lib.cpp @@ -49,15 +49,15 @@ void context_t::parseEnvEnabledLayers() { } } -void context_t::initLayers() const { +void context_t::initLayers() { for (auto &[layer, _] : layers) { - layer->init((ur_dditable_t *)&urDdiTable, enabledLayerNames, - codelocData); + layer->init(&urDdiTable, enabledLayerNames, codelocData); } } void context_t::tearDownLayers() const { - for (auto &[layer, destroy] : layers) { + for (auto it = layers.rbegin(); it != layers.rend(); ++it) { + auto [layer, destroy] = *it; layer->tearDown(); destroy(); } @@ -212,7 +212,10 @@ ur_result_t urLoaderTearDown() { delete context; }); - return ret == 0 ? 
UR_RESULT_SUCCESS : UR_RESULT_ERROR_UNINITIALIZED; + ur_result_t result = + ret == 0 ? UR_RESULT_SUCCESS : UR_RESULT_ERROR_UNINITIALIZED; + logger::info("---> urLoaderTearDown() -> {}", result); + return result; } ur_result_t @@ -556,19 +559,20 @@ ur_result_t urDeviceGetSelected(ur_platform_handle_t hPlatform, const auto thirdDeviceId = getDeviceId(thirdPart); deviceList.push_back(DeviceSpec{ DevicePartLevel::SUBSUB, hardwareType, firstDeviceId, - secondDeviceId, thirdDeviceId}); + secondDeviceId, thirdDeviceId, nullptr}); } else { // second dot not found, this is a subdevice - deviceList.push_back(DeviceSpec{DevicePartLevel::SUB, - hardwareType, firstDeviceId, - secondDeviceId}); + deviceList.push_back( + DeviceSpec{DevicePartLevel::SUB, hardwareType, + firstDeviceId, secondDeviceId, 0, nullptr}); } } else { // first dot not found, this is a root device const auto hardwareType = getRootHardwareType(filterString); const auto firstDeviceId = getDeviceId(filterString); deviceList.push_back(DeviceSpec{DevicePartLevel::ROOT, - hardwareType, firstDeviceId}); + hardwareType, firstDeviceId, 0, + 0, nullptr}); } } } @@ -583,8 +587,9 @@ ur_result_t urDeviceGetSelected(ur_platform_handle_t hPlatform, // for example, we pretend that "garbage:0;!cuda:*" was just "!cuda:*" // so we add an implicit accept-all term (equivalent to prepending "*:*;") // as we would have done if the user had given us the corrected string - acceptDeviceList.push_back(DeviceSpec{ - DevicePartLevel::ROOT, ::UR_DEVICE_TYPE_ALL, DeviceIdTypeALL}); + acceptDeviceList.push_back(DeviceSpec{DevicePartLevel::ROOT, + ::UR_DEVICE_TYPE_ALL, + DeviceIdTypeALL, 0, 0, nullptr}); } logger::debug("DEBUG: size of acceptDeviceList = {}", diff --git a/source/loader/ur_lib.hpp b/source/loader/ur_lib.hpp index edd0fffe9f..6dc6e53cc5 100644 --- a/source/loader/ur_lib.hpp +++ b/source/loader/ur_lib.hpp @@ -74,13 +74,15 @@ class __urdlllocal context_t : public AtomicSingleton { const std::vector layers = { 
{ur_validation_layer::getContext(), ur_validation_layer::context_t::forceDelete}, -#if UR_ENABLE_TRACING - {ur_tracing_layer::getContext(), - ur_tracing_layer::context_t::forceDelete}, -#endif + // Initialize tracing layer after sanitizer layer to make sure tracing + // layer will properly print all API calls. #if UR_ENABLE_SANITIZER {ur_sanitizer_layer::getContext(), ur_sanitizer_layer::context_t::forceDelete}, +#endif +#if UR_ENABLE_TRACING + {ur_tracing_layer::getContext(), + ur_tracing_layer::context_t::forceDelete}, #endif }; @@ -110,7 +112,7 @@ class __urdlllocal context_t : public AtomicSingleton { codeloc_data codelocData; void parseEnvEnabledLayers(); - void initLayers() const; + void initLayers(); void tearDownLayers() const; }; diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index a9c1af038f..1ccedfeb30 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -151,7 +151,7 @@ ur_result_t UR_APICALL urLoaderConfigEnableLayer( hLoaderConfig, ///< [in] Handle to config object the layer will be enabled for. const char * pLayerName ///< [in] Null terminated string containing the name of the layer to - ///< enable. + ///< enable. Empty if none are enabled. ) try { return ur_lib::urLoaderConfigEnableLayer(hLoaderConfig, pLayerName); } catch (...) { @@ -292,6 +292,7 @@ ur_result_t UR_APICALL urLoaderTearDown(void) try { /// - ::UR_RESULT_ERROR_DEVICE_LOST /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_SIZE +/// + `NumEntries == 0 && phAdapters != NULL` ur_result_t UR_APICALL urAdapterGet( uint32_t NumEntries, ///< [in] the number of adapters to be added to phAdapters. @@ -301,7 +302,7 @@ ur_result_t UR_APICALL urAdapterGet( ur_adapter_handle_t * phAdapters, ///< [out][optional][range(0, NumEntries)] array of handle of adapters. ///< If NumEntries is less than the number of adapters available, then - ///< ::urAdapterGet shall only retrieve that number of platforms. 
+ ///< ::urAdapterGet shall only retrieve that number of adapters. uint32_t * pNumAdapters ///< [out][optional] returns the total number of adapters available. ) try { @@ -1135,7 +1136,7 @@ ur_result_t UR_APICALL urDeviceGetNativeHandle( /// - ::UR_RESULT_ERROR_DEVICE_LOST /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hPlatform` +/// + `NULL == hAdapter` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phDevice` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE @@ -1143,7 +1144,8 @@ ur_result_t UR_APICALL urDeviceGetNativeHandle( ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. ur_device_handle_t @@ -1155,7 +1157,7 @@ ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( return UR_RESULT_ERROR_UNINITIALIZED; } - return pfnCreateWithNativeHandle(hNativeDevice, hPlatform, pProperties, + return pfnCreateWithNativeHandle(hNativeDevice, hAdapter, pProperties, phDevice); } catch (...) 
{ return exceptionToResult(std::current_exception()); @@ -1504,8 +1506,7 @@ ur_result_t UR_APICALL urContextSetExtendedDeleter( /// /// @details /// - The primary ::ur_image_format_t that must be supported by all the -/// adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA, -/// UR_IMAGE_CHANNEL_TYPE_UNORM_INT8}, +/// adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT8}, /// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT16}, /// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT8}, /// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT16}, @@ -3486,6 +3487,11 @@ ur_result_t UR_APICALL urProgramGetBuildInfo( /// + `NULL == pSpecConstants` /// - ::UR_RESULT_ERROR_INVALID_SIZE /// + `count == 0` +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + A pSpecConstant entry contains a size that does not match that of the specialization constant in the module. +/// + A pSpecConstant entry contains a nullptr pValue. +/// - ::UR_RESULT_ERROR_INVALID_SPEC_ID +/// + Any id specified in a pSpecConstant entry is not a valid specialization constant identifier. ur_result_t UR_APICALL urProgramSetSpecializationConstants( ur_program_handle_t hProgram, ///< [in] handle of the Program object uint32_t count, ///< [in] the number of elements in the pSpecConstants array @@ -3646,6 +3652,7 @@ ur_result_t UR_APICALL urKernelSetArgValue( *pProperties, ///< [in][optional] pointer to value properties. const void *pArgValue ///< [in] argument value represented as matching arg type. + ///< The data pointed to will be copied and therefore can be reused on return. 
) try { auto pfnSetArgValue = ur_lib::getContext()->urDdiTable.Kernel.pfnSetArgValue; @@ -3762,7 +3769,7 @@ ur_result_t UR_APICALL urKernelGetInfo( /// + `NULL == hKernel` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE < propName` +/// + `::UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE < propName` ur_result_t UR_APICALL urKernelGetGroupInfo( ur_kernel_handle_t hKernel, ///< [in] handle of the Kernel object ur_device_handle_t hDevice, ///< [in] handle of the Device object @@ -4079,6 +4086,11 @@ ur_result_t UR_APICALL urKernelSetArgMemObj( /// + `count == 0` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE /// + If ::UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS query is false +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + A pSpecConstant entry contains a size that does not match that of the specialization constant in the module. +/// + A pSpecConstant entry contains a nullptr pValue. +/// - ::UR_RESULT_ERROR_INVALID_SPEC_ID +/// + Any id specified in a pSpecConstant entry is not a valid specialization constant identifier. ur_result_t UR_APICALL urKernelSetSpecializationConstants( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object uint32_t count, ///< [in] the number of elements in the pSpecConstants array @@ -4926,17 +4938,16 @@ ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that ///< specify the number of local work-items forming a work-group that will ///< execute the kernel function. - ///< If nullptr, the runtime implementation will choose the work-group - ///< size. + ///< If nullptr, the runtime implementation will choose the work-group size. uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. 
- ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) try { auto pfnKernelLaunch = ur_lib::getContext()->urDdiTable.Enqueue.pfnKernelLaunch; @@ -4993,7 +5004,8 @@ ur_result_t UR_APICALL urEnqueueEventsWait( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { auto pfnEventsWait = ur_lib::getContext()->urDdiTable.Enqueue.pfnEventsWait; if (nullptr == pfnEventsWait) { @@ -5049,7 +5061,8 @@ ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { auto pfnEventsWaitWithBarrier = ur_lib::getContext()->urDdiTable.Enqueue.pfnEventsWaitWithBarrier; @@ -5113,7 +5126,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferRead( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) try { auto pfnMemBufferRead = ur_lib::getContext()->urDdiTable.Enqueue.pfnMemBufferRead; @@ -5179,7 +5193,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferWrite( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { auto pfnMemBufferWrite = ur_lib::getContext()->urDdiTable.Enqueue.pfnMemBufferWrite; @@ -5265,7 +5280,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferReadRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { auto pfnMemBufferReadRect = ur_lib::getContext()->urDdiTable.Enqueue.pfnMemBufferReadRect; @@ -5356,7 +5372,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { auto pfnMemBufferWriteRect = ur_lib::getContext()->urDdiTable.Enqueue.pfnMemBufferWriteRect; @@ -5419,7 +5436,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferCopy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) try { auto pfnMemBufferCopy = ur_lib::getContext()->urDdiTable.Enqueue.pfnMemBufferCopy; @@ -5498,7 +5516,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { auto pfnMemBufferCopyRect = ur_lib::getContext()->urDdiTable.Enqueue.pfnMemBufferCopyRect; @@ -5566,7 +5585,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferFill( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { auto pfnMemBufferFill = ur_lib::getContext()->urDdiTable.Enqueue.pfnMemBufferFill; @@ -5637,7 +5657,8 @@ ur_result_t UR_APICALL urEnqueueMemImageRead( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { auto pfnMemImageRead = ur_lib::getContext()->urDdiTable.Enqueue.pfnMemImageRead; @@ -5709,7 +5730,8 @@ ur_result_t UR_APICALL urEnqueueMemImageWrite( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) try { auto pfnMemImageWrite = ur_lib::getContext()->urDdiTable.Enqueue.pfnMemImageWrite; @@ -5776,7 +5798,8 @@ ur_result_t UR_APICALL urEnqueueMemImageCopy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { auto pfnMemImageCopy = ur_lib::getContext()->urDdiTable.Enqueue.pfnMemImageCopy; @@ -5847,7 +5870,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferMap( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent, ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. void **ppRetMap ///< [out] return mapped pointer. TODO: move it before ///< numEventsInWaitList? ) try { @@ -5906,7 +5930,8 @@ ur_result_t UR_APICALL urEnqueueMemUnmap( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { auto pfnMemUnmap = ur_lib::getContext()->urDdiTable.Enqueue.pfnMemUnmap; if (nullptr == pfnMemUnmap) { @@ -5966,7 +5991,8 @@ ur_result_t UR_APICALL urEnqueueUSMFill( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) try { auto pfnUSMFill = ur_lib::getContext()->urDdiTable.Enqueue.pfnUSMFill; if (nullptr == pfnUSMFill) { @@ -6022,7 +6048,8 @@ ur_result_t UR_APICALL urEnqueueUSMMemcpy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { auto pfnUSMMemcpy = ur_lib::getContext()->urDdiTable.Enqueue.pfnUSMMemcpy; if (nullptr == pfnUSMMemcpy) { @@ -6082,7 +6109,8 @@ ur_result_t UR_APICALL urEnqueueUSMPrefetch( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { auto pfnUSMPrefetch = ur_lib::getContext()->urDdiTable.Enqueue.pfnUSMPrefetch; @@ -6195,11 +6223,11 @@ ur_result_t UR_APICALL urEnqueueUSMFill2D( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. 
) try { auto pfnUSMFill2D = ur_lib::getContext()->urDdiTable.Enqueue.pfnUSMFill2D; if (nullptr == pfnUSMFill2D) { @@ -6261,11 +6289,11 @@ ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) try { auto pfnUSMMemcpy2D = ur_lib::getContext()->urDdiTable.Enqueue.pfnUSMMemcpy2D; @@ -6316,11 +6344,11 @@ ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. 
) try { auto pfnDeviceGlobalVariableWrite = ur_lib::getContext()->urDdiTable.Enqueue.pfnDeviceGlobalVariableWrite; @@ -6371,11 +6399,11 @@ ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) try { auto pfnDeviceGlobalVariableRead = ur_lib::getContext()->urDdiTable.Enqueue.pfnDeviceGlobalVariableRead; @@ -6431,9 +6459,10 @@ ur_result_t UR_APICALL urEnqueueReadHostPipe( ///< events that must be complete before the host pipe read. ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * - phEvent ///< [out][optional] returns an event object that identifies this read - ///< command + phEvent ///< [out][optional] returns an event object that identifies this read command ///< and can be used to query or queue a wait for this command to complete. + ///< If phEventWaitList and phEvent are not NULL, phEvent must not refer to + ///< an element of the phEventWaitList array. ) try { auto pfnReadHostPipe = ur_lib::getContext()->urDdiTable.Enqueue.pfnReadHostPipe; @@ -6491,6 +6520,8 @@ ur_result_t UR_APICALL urEnqueueWriteHostPipe( ur_event_handle_t * phEvent ///< [out][optional] returns an event object that identifies this write command ///< and can be used to query or queue a wait for this command to complete. 
+ ///< If phEventWaitList and phEvent are not NULL, phEvent must not refer to + ///< an element of the phEventWaitList array. ) try { auto pfnWriteHostPipe = ur_lib::getContext()->urDdiTable.Enqueue.pfnWriteHostPipe; @@ -6886,7 +6917,8 @@ ur_result_t UR_APICALL urBindlessImagesImageCopyExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { auto pfnImageCopyExp = ur_lib::getContext()->urDdiTable.BindlessImagesExp.pfnImageCopyExp; @@ -7035,8 +7067,8 @@ ur_result_t UR_APICALL urBindlessImagesMipmapFreeExp( /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// + `::UR_EXP_EXTERNAL_MEM_TYPE_WIN32_NT_DX12_RESOURCE < memHandleType` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pInteropMemDesc` -/// + `NULL == phInteropMem` +/// + `NULL == pExternalMemDesc` +/// + `NULL == phExternalMem` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT @@ -7046,10 +7078,10 @@ ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( size_t size, ///< [in] size of the external memory ur_exp_external_mem_type_t memHandleType, ///< [in] type of external memory handle - ur_exp_interop_mem_desc_t - *pInteropMemDesc, ///< [in] the interop memory descriptor - ur_exp_interop_mem_handle_t - *phInteropMem ///< [out] interop memory handle to the external memory + ur_exp_external_mem_desc_t + *pExternalMemDesc, ///< [in] the external memory descriptor + ur_exp_external_mem_handle_t + *phExternalMem ///< [out] external memory handle to the external memory ) try { auto pfnImportExternalMemoryExp = ur_lib::getContext() @@ -7059,13 +7091,13 @@ ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( } return pfnImportExternalMemoryExp(hContext, hDevice, size, 
memHandleType, - pInteropMemDesc, phInteropMem); + pExternalMemDesc, phExternalMem); } catch (...) { return exceptionToResult(std::current_exception()); } /////////////////////////////////////////////////////////////////////////////// -/// @brief Map an interop memory handle to an image memory handle +/// @brief Map an external memory handle to an image memory handle /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -7075,7 +7107,7 @@ ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hContext` /// + `NULL == hDevice` -/// + `NULL == hInteropMem` +/// + `NULL == hExternalMem` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pImageFormat` /// + `NULL == pImageDesc` @@ -7093,8 +7125,8 @@ ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description - ur_exp_interop_mem_handle_t - hInteropMem, ///< [in] interop memory handle to the external memory + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory ur_exp_image_mem_native_handle_t * phImageMem ///< [out] image memory handle to the externally allocated memory ) try { @@ -7106,13 +7138,55 @@ ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( } return pfnMapExternalArrayExp(hContext, hDevice, pImageFormat, pImageDesc, - hInteropMem, phImageMem); + hExternalMem, phImageMem); +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Map an external memory handle to a device memory region described by +/// void* +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hContext` +/// + `NULL == hDevice` +/// + `NULL == hExternalMem` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == ppRetMem` +/// - ::UR_RESULT_ERROR_INVALID_CONTEXT +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory + ) try { + auto pfnMapExternalLinearMemoryExp = + ur_lib::getContext() + ->urDdiTable.BindlessImagesExp.pfnMapExternalLinearMemoryExp; + if (nullptr == pfnMapExternalLinearMemoryExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnMapExternalLinearMemoryExp(hContext, hDevice, offset, size, + hExternalMem, ppRetMem); } catch (...) 
{ return exceptionToResult(std::current_exception()); } /////////////////////////////////////////////////////////////////////////////// -/// @brief Release interop memory +/// @brief Release external memory /// /// @remarks /// _Analogues_ @@ -7126,22 +7200,23 @@ ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hContext` /// + `NULL == hDevice` -/// + `NULL == hInteropMem` +/// + `NULL == hExternalMem` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE -ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( +ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( ur_context_handle_t hContext, ///< [in] handle of the context object ur_device_handle_t hDevice, ///< [in] handle of the device object - ur_exp_interop_mem_handle_t - hInteropMem ///< [in][release] handle of interop memory to be destroyed + ur_exp_external_mem_handle_t + hExternalMem ///< [in][release] handle of external memory to be destroyed ) try { - auto pfnReleaseInteropExp = - ur_lib::getContext()->urDdiTable.BindlessImagesExp.pfnReleaseInteropExp; - if (nullptr == pfnReleaseInteropExp) { + auto pfnReleaseExternalMemoryExp = + ur_lib::getContext() + ->urDdiTable.BindlessImagesExp.pfnReleaseExternalMemoryExp; + if (nullptr == pfnReleaseExternalMemoryExp) { return UR_RESULT_ERROR_UNINITIALIZED; } - return pfnReleaseInteropExp(hContext, hDevice, hInteropMem); + return pfnReleaseExternalMemoryExp(hContext, hDevice, hExternalMem); } catch (...) 
{ return exceptionToResult(std::current_exception()); } @@ -7164,8 +7239,8 @@ ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// + `::UR_EXP_EXTERNAL_SEMAPHORE_TYPE_WIN32_NT_DX12_FENCE < semHandleType` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pInteropSemaphoreDesc` -/// + `NULL == phInteropSemaphore` +/// + `NULL == pExternalSemaphoreDesc` +/// + `NULL == phExternalSemaphore` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( @@ -7173,10 +7248,10 @@ ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( ur_device_handle_t hDevice, ///< [in] handle of the device object ur_exp_external_semaphore_type_t semHandleType, ///< [in] type of external memory handle - ur_exp_interop_semaphore_desc_t - *pInteropSemaphoreDesc, ///< [in] the interop semaphore descriptor - ur_exp_interop_semaphore_handle_t * - phInteropSemaphore ///< [out] interop semaphore handle to the external semaphore + ur_exp_external_semaphore_desc_t + *pExternalSemaphoreDesc, ///< [in] the external semaphore descriptor + ur_exp_external_semaphore_handle_t * + phExternalSemaphore ///< [out] external semaphore handle to the external semaphore ) try { auto pfnImportExternalSemaphoreExp = ur_lib::getContext() @@ -7186,8 +7261,8 @@ ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( } return pfnImportExternalSemaphoreExp(hContext, hDevice, semHandleType, - pInteropSemaphoreDesc, - phInteropSemaphore); + pExternalSemaphoreDesc, + phExternalSemaphore); } catch (...) 
{ return exceptionToResult(std::current_exception()); } @@ -7207,14 +7282,14 @@ ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hContext` /// + `NULL == hDevice` -/// + `NULL == hInteropSemaphore` +/// + `NULL == hExternalSemaphore` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( ur_context_handle_t hContext, ///< [in] handle of the context object ur_device_handle_t hDevice, ///< [in] handle of the device object - ur_exp_interop_semaphore_handle_t - hInteropSemaphore ///< [in][release] handle of interop semaphore to be destroyed + ur_exp_external_semaphore_handle_t + hExternalSemaphore ///< [in][release] handle of external semaphore to be destroyed ) try { auto pfnReleaseExternalSemaphoreExp = ur_lib::getContext() @@ -7223,7 +7298,8 @@ ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( return UR_RESULT_ERROR_UNINITIALIZED; } - return pfnReleaseExternalSemaphoreExp(hContext, hDevice, hInteropSemaphore); + return pfnReleaseExternalSemaphoreExp(hContext, hDevice, + hExternalSemaphore); } catch (...) { return exceptionToResult(std::current_exception()); } @@ -7247,8 +7323,8 @@ ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( /// - ::UR_RESULT_ERROR_INVALID_VALUE ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( ur_queue_handle_t hQueue, ///< [in] handle of the queue object - ur_exp_interop_semaphore_handle_t - hSemaphore, ///< [in] interop semaphore handle + ur_exp_external_semaphore_handle_t + hSemaphore, ///< [in] external semaphore handle bool hasWaitValue, ///< [in] indicates whether the samephore is capable and should wait on a ///< certain value. @@ -7264,7 +7340,8 @@ ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( ///< must be complete. 
ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) try { auto pfnWaitExternalSemaphoreExp = ur_lib::getContext() @@ -7300,8 +7377,8 @@ ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( /// - ::UR_RESULT_ERROR_INVALID_VALUE ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( ur_queue_handle_t hQueue, ///< [in] handle of the queue object - ur_exp_interop_semaphore_handle_t - hSemaphore, ///< [in] interop semaphore handle + ur_exp_external_semaphore_handle_t + hSemaphore, ///< [in] external semaphore handle bool hasSignalValue, ///< [in] indicates whether the samephore is capable and should signal on a ///< certain value. @@ -7317,7 +7394,8 @@ ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) try { auto pfnSignalExternalSemaphoreExp = ur_lib::getContext() @@ -7479,12 +7557,23 @@ ur_result_t UR_APICALL urCommandBufferFinalizeExp( /// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION /// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE /// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + `phKernelAlternatives == NULL && numKernelAlternatives > 0` +/// + `phKernelAlternatives != NULL && numKernelAlternatives == 0` +/// + If `phKernelAlternatives` contains `hKernel` /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// - ::UR_RESULT_ERROR_INVALID_OPERATION - "phCommand is not NULL and hCommandBuffer is not updatable." ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t hCommandBuffer, ///< [in] Handle of the command-buffer object. @@ -7495,16 +7584,37 @@ ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( const size_t * pGlobalWorkSize, ///< [in] Global work size to use when executing kernel. const size_t * - pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. + pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. 
If this + ///< parameter is nullptr, then a local work size will be generated by the + ///< implementation. + uint32_t + numKernelAlternatives, ///< [in] The number of kernel alternatives provided in + ///< phKernelAlternatives. + ur_kernel_handle_t * + phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernel handles + ///< that might be used to update the kernel in this + ///< command after the command-buffer is finalized. The default kernel + ///< `hKernel` is implicitly marked as an alternative. It's + ///< invalid to specify it as part of this list. uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * pSyncPoint, ///< [out][optional] Sync point associated with this command. - ur_exp_command_buffer_command_handle_t - *phCommand ///< [out][optional] Handle to this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t * + phCommand ///< [out][optional] Handle to this command. Only available if the + ///< command-buffer is updatable. 
) try { auto pfnAppendKernelLaunchExp = ur_lib::getContext() @@ -7513,10 +7623,11 @@ ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( return UR_RESULT_ERROR_UNINITIALIZED; } - return pfnAppendKernelLaunchExp(hCommandBuffer, hKernel, workDim, - pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint, phCommand); + return pfnAppendKernelLaunchExp( + hCommandBuffer, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numKernelAlternatives, phKernelAlternatives, + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); } catch (...) { return exceptionToResult(std::current_exception()); } @@ -7543,6 +7654,13 @@ ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( @@ -7556,8 +7674,19 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. 
+ uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) try { auto pfnAppendUSMMemcpyExp = ur_lib::getContext()->urDdiTable.CommandBufferExp.pfnAppendUSMMemcpyExp; @@ -7567,7 +7696,8 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( return pfnAppendUSMMemcpyExp(hCommandBuffer, pDst, pSrc, size, numSyncPointsInWaitList, pSyncPointWaitList, - pSyncPoint); + numEventsInWaitList, phEventWaitList, + pSyncPoint, phEvent, phCommand); } catch (...) { return exceptionToResult(std::current_exception()); } @@ -7596,6 +7726,13 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. 
+/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( @@ -7611,8 +7748,19 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) try { auto pfnAppendUSMFillExp = ur_lib::getContext()->urDdiTable.CommandBufferExp.pfnAppendUSMFillExp; @@ -7622,7 +7770,8 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( return pfnAppendUSMFillExp(hCommandBuffer, pMemory, pPattern, patternSize, size, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); } catch (...) 
{ return exceptionToResult(std::current_exception()); } @@ -7645,6 +7794,13 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( @@ -7660,8 +7816,19 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. 
+ ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) try { auto pfnAppendMemBufferCopyExp = ur_lib::getContext() @@ -7672,7 +7839,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( return pfnAppendMemBufferCopyExp( hCommandBuffer, hSrcMem, hDstMem, srcOffset, dstOffset, size, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); } catch (...) { return exceptionToResult(std::current_exception()); } @@ -7696,6 +7864,13 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( @@ -7711,8 +7886,19 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. 
+ const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) try { auto pfnAppendMemBufferWriteExp = ur_lib::getContext() @@ -7721,9 +7907,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( return UR_RESULT_ERROR_UNINITIALIZED; } - return pfnAppendMemBufferWriteExp(hCommandBuffer, hBuffer, offset, size, - pSrc, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + return pfnAppendMemBufferWriteExp( + hCommandBuffer, hBuffer, offset, size, pSrc, numSyncPointsInWaitList, + pSyncPointWaitList, numEventsInWaitList, phEventWaitList, pSyncPoint, + phEvent, phCommand); } catch (...) { return exceptionToResult(std::current_exception()); } @@ -7747,6 +7934,13 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. 
+/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( @@ -7761,8 +7955,19 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) try { auto pfnAppendMemBufferReadExp = ur_lib::getContext() @@ -7771,9 +7976,10 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( return UR_RESULT_ERROR_UNINITIALIZED; } - return pfnAppendMemBufferReadExp(hCommandBuffer, hBuffer, offset, size, - pDst, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint); + return pfnAppendMemBufferReadExp( + hCommandBuffer, hBuffer, offset, size, pDst, numSyncPointsInWaitList, + pSyncPointWaitList, numEventsInWaitList, phEventWaitList, pSyncPoint, + phEvent, phCommand); } catch (...) 
{ return exceptionToResult(std::current_exception()); } @@ -7796,6 +8002,13 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( @@ -7818,8 +8031,19 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. 
+ ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) try { auto pfnAppendMemBufferCopyRectExp = ur_lib::getContext() @@ -7831,7 +8055,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( return pfnAppendMemBufferCopyRectExp( hCommandBuffer, hSrcMem, hDstMem, srcOrigin, dstOrigin, region, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); } catch (...) { return exceptionToResult(std::current_exception()); } @@ -7855,6 +8080,13 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( @@ -7883,8 +8115,19 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. 
+ const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) try { auto pfnAppendMemBufferWriteRectExp = ur_lib::getContext() @@ -7896,7 +8139,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( return pfnAppendMemBufferWriteRectExp( hCommandBuffer, hBuffer, bufferOffset, hostOffset, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); } catch (...) { return exceptionToResult(std::current_exception()); } @@ -7920,6 +8164,13 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. 
+/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( @@ -7946,8 +8197,19 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) try { auto pfnAppendMemBufferReadRectExp = ur_lib::getContext() @@ -7959,7 +8221,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( return pfnAppendMemBufferReadRectExp( hCommandBuffer, hBuffer, bufferOffset, hostOffset, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); } catch (...) 
{ return exceptionToResult(std::current_exception()); } @@ -7985,6 +8248,13 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT /// - ::UR_RESULT_ERROR_INVALID_SIZE /// + If `offset + size` results in an out-of-bounds access. +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( @@ -8001,8 +8271,19 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. 
+ ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) try { auto pfnAppendMemBufferFillExp = ur_lib::getContext() @@ -8013,7 +8294,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( return pfnAppendMemBufferFillExp( hCommandBuffer, hBuffer, pPattern, patternSize, offset, size, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); + numSyncPointsInWaitList, pSyncPointWaitList, numEventsInWaitList, + phEventWaitList, pSyncPoint, phEvent, phCommand); } catch (...) { return exceptionToResult(std::current_exception()); } @@ -8046,6 +8328,13 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( /// - ::UR_RESULT_ERROR_INVALID_SIZE /// + `size == 0` /// + If `size` is higher than the allocation size of `pMemory` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( @@ -8059,8 +8348,19 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. 
If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) try { auto pfnAppendUSMPrefetchExp = ur_lib::getContext() @@ -8071,7 +8371,8 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( return pfnAppendUSMPrefetchExp(hCommandBuffer, pMemory, size, flags, numSyncPointsInWaitList, pSyncPointWaitList, - pSyncPoint); + numEventsInWaitList, phEventWaitList, + pSyncPoint, phEvent, phCommand); } catch (...) { return exceptionToResult(std::current_exception()); } @@ -8104,6 +8405,13 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( /// - ::UR_RESULT_ERROR_INVALID_SIZE /// + `size == 0` /// + If `size` is higher than the allocation size of `pMemory` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. 
/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( @@ -8117,8 +8425,19 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) try { auto pfnAppendUSMAdviseExp = ur_lib::getContext()->urDdiTable.CommandBufferExp.pfnAppendUSMAdviseExp; @@ -8128,7 +8447,8 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( return pfnAppendUSMAdviseExp(hCommandBuffer, pMemory, size, advice, numSyncPointsInWaitList, pSyncPointWaitList, - pSyncPoint); + numEventsInWaitList, phEventWaitList, + pSyncPoint, phEvent, phCommand); } catch (...) { return exceptionToResult(std::current_exception()); } @@ -8165,7 +8485,8 @@ ur_result_t UR_APICALL urCommandBufferEnqueueExp( ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command-buffer execution instance. 
+ ///< command-buffer execution instance. If phEventWaitList and phEvent are + ///< not NULL, phEvent must not refer to an element of the phEventWaitList array. ) try { auto pfnEnqueueExp = ur_lib::getContext()->urDdiTable.CommandBufferExp.pfnEnqueueExp; @@ -8237,9 +8558,10 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Update a kernel launch command in a finalized command-buffer. This -/// entry-point is synchronous and may block if the command-buffer is -/// executing when the entry-point is called. +/// @brief Update a kernel launch command in a finalized command-buffer. +/// +/// @details +/// This entry-point is synchronous and may block if the command-buffer is executing when the entry-point is called. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -8255,18 +8577,17 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( /// - ::UR_RESULT_ERROR_INVALID_OPERATION /// + If ::ur_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to. /// + If the command-buffer `hCommand` belongs to has not been finalized. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and different from the work-dim used on creation of `hCommand`. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value and `pUpdateKernelLaunch->pNewGlobalWorkSize` is NULL. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value when `hCommand` was created with a NULL local work size. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a NULL value when `hCommand` was created with a non-NULL local work size. 
-/// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP +/// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP - "If `hCommand` is not a kernel execution command." /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT /// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX /// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// + `pUpdateKernelLaunch->newWorkDim < 1 || pUpdateKernelLaunch->newWorkDim > 3` /// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE /// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + If `pUpdateKernelLaunch->hNewKernel` was not passed to the `hKernel` or `phKernelAlternatives` parameters of ::urCommandBufferAppendKernelLaunchExp when this command was created. +/// + If `pUpdateKernelLaunch->newWorkDim` is different from the current workDim in `hCommand` and pUpdateKernelLaunch->pNewGlobalWorkSize or pUpdateKernelLaunch->pNewGlobalWorkOffset are nullptr. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( @@ -8287,6 +8608,95 @@ ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Get a new event that will be signaled the next time the command in the +/// command-buffer executes. +/// +/// @details +/// It is the user's responsibility to release the returned `phSignalEvent`.
+/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hCommand` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == phSignalEvent` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS is not supported by the device associated with `hCommand`. +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// + If ::ur_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to. +/// + If the command-buffer `hCommand` belongs to has not been finalized. +/// + If no `phEvent` parameter was set on creation of the command associated with `hCommand`. +/// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +ur_result_t UR_APICALL urCommandBufferUpdateSignalEventExp( + ur_exp_command_buffer_command_handle_t + hCommand, ///< [in] Handle of the command-buffer command to update. + ur_event_handle_t *phSignalEvent ///< [out] Event to be signaled. + ) try { + auto pfnUpdateSignalEventExp = + ur_lib::getContext() + ->urDdiTable.CommandBufferExp.pfnUpdateSignalEventExp; + if (nullptr == pfnUpdateSignalEventExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnUpdateSignalEventExp(hCommand, phSignalEvent); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Set the list of wait events for a command to depend on to a list of +/// new events. 
+/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hCommand` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS is not supported by the device associated with `hCommand`. +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// + If ::ur_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to. +/// + If the command-buffer `hCommand` belongs to has not been finalized. +/// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// + If `numEventsInWaitList` does not match the number of wait events set when the command associated with `hCommand` was created. +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +ur_result_t UR_APICALL urCommandBufferUpdateWaitEventsExp( + ur_exp_command_buffer_command_handle_t + hCommand, ///< [in] Handle of the command-buffer command to update. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. + ) try { + auto pfnUpdateWaitEventsExp = + ur_lib::getContext() + ->urDdiTable.CommandBufferExp.pfnUpdateWaitEventsExp; + if (nullptr == pfnUpdateWaitEventsExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnUpdateWaitEventsExp(hCommand, numEventsInWaitList, + phEventWaitList); +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Get command-buffer object information. /// @@ -8298,7 +8708,7 @@ ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT < propName` +/// + `::UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -8426,17 +8836,16 @@ ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that ///< specify the number of local work-items forming a work-group that will ///< execute the kernel function. - ///< If nullptr, the runtime implementation will choose the work-group - ///< size. + ///< If nullptr, the runtime implementation will choose the work-group size. uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. 
) try { auto pfnCooperativeKernelLaunchExp = ur_lib::getContext() @@ -8513,8 +8922,7 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait - ///< events. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies this particular kernel ///< execution instance. Profiling information can be queried @@ -8522,7 +8930,9 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` ///< reports the timestamp at the time of the call to this function. ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` - ///< reports the timestamp recorded when the command is executed on the device. + ///< reports the timestamp recorded when the command is executed on the + ///< device. If phEventWaitList and phEvent are not NULL, phEvent must not + ///< refer to an element of the phEventWaitList array. ) try { auto pfnTimestampRecordingExp = ur_lib::getContext()->urDdiTable.EnqueueExp.pfnTimestampRecordingExp; @@ -8609,7 +9019,9 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( ///< the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList + ///< array. 
) try { auto pfnKernelLaunchCustomExp = ur_lib::getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchCustomExp; @@ -9039,7 +9451,8 @@ ur_result_t UR_APICALL urEnqueueNativeCommandExp( ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies the work that has - ///< been enqueued in nativeEnqueueFunc. + ///< been enqueued in nativeEnqueueFunc. If phEventWaitList and phEvent are + ///< not NULL, phEvent must not refer to an element of the phEventWaitList array. ) try { auto pfnNativeCommandExp = ur_lib::getContext()->urDdiTable.EnqueueExp.pfnNativeCommandExp; diff --git a/source/loader/ur_loader.cpp b/source/loader/ur_loader.cpp index bfc9da3e50..e5a2bdb34e 100644 --- a/source/loader/ur_loader.cpp +++ b/source/loader/ur_loader.cpp @@ -8,13 +8,36 @@ * */ #include "ur_loader.hpp" +#ifdef UR_STATIC_ADAPTER_LEVEL_ZERO +#include "adapters/level_zero/ur_interface_loader.hpp" +#endif namespace ur_loader { /////////////////////////////////////////////////////////////////////////////// context_t *getContext() { return context_t::get_direct(); } -/////////////////////////////////////////////////////////////////////////////// ur_result_t context_t::init() { +#ifdef _WIN32 + // Suppress system errors. + // Tells the system to not display the critical-error-handler message box. + // Instead, the system sends the error to the calling process. + // This is crucial for graceful handling of adapters that couldn't be + // loaded, e.g. due to missing native run-times. + // TODO: add reporting in case of an error. + // NOTE: we restore the old mode to not affect user app behavior. 
+ // See https://github.com/intel/llvm/blob/sycl/sycl/ur_win_proxy_loader/ur_win_proxy_loader.cpp (preloadLibraries()) + UINT SavedMode = SetErrorMode(SEM_FAILCRITICALERRORS); +#endif + +#ifdef UR_STATIC_ADAPTER_LEVEL_ZERO + // If the adapters were force loaded, it means the user wants to use + // a specific adapter library. Don't load any static adapters. + if (!adapter_registry.adaptersForceLoaded()) { + auto &level_zero = platforms.emplace_back(nullptr); + ur::level_zero::urAdapterGetDdiTables(&level_zero.dditable.ur); + } +#endif + for (const auto &adapterPaths : adapter_registry) { for (const auto &path : adapterPaths) { auto handle = LibLoader::loadAdapterLibrary(path.string().c_str()); @@ -24,6 +47,10 @@ ur_result_t context_t::init() { } } } +#ifdef _WIN32 + // Restore system error handling. + (void)SetErrorMode(SavedMode); +#endif forceIntercept = getenv_tobool("UR_ENABLE_LOADER_INTERCEPT"); diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index 975a99b304..c3a31382ff 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -955,16 +955,16 @@ ur_result_t urPrintExpSamplerCubemapProperties( } ur_result_t -urPrintExpInteropMemDesc(const struct ur_exp_interop_mem_desc_t params, - char *buffer, const size_t buff_size, - size_t *out_size) { +urPrintExpExternalMemDesc(const struct ur_exp_external_mem_desc_t params, + char *buffer, const size_t buff_size, + size_t *out_size) { std::stringstream ss; ss << params; return str_copy(&ss, buffer, buff_size, out_size); } -ur_result_t urPrintExpInteropSemaphoreDesc( - const struct ur_exp_interop_semaphore_desc_t params, char *buffer, +ur_result_t urPrintExpExternalSemaphoreDesc( + const struct ur_exp_external_semaphore_desc_t params, char *buffer, const size_t buff_size, size_t *out_size) { std::stringstream ss; ss << params; @@ -980,6 +980,14 @@ urPrintExpImageCopyRegion(const struct ur_exp_image_copy_region_t params, return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t 
urPrintDeviceCommandBufferUpdateCapabilityFlags( + enum ur_device_command_buffer_update_capability_flag_t value, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << value; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintExpCommandBufferInfo(enum ur_exp_command_buffer_info_t value, char *buffer, const size_t buff_size, size_t *out_size) { @@ -1262,8 +1270,18 @@ ur_result_t urPrintBindlessImagesMapExternalArrayExpParams( return str_copy(&ss, buffer, buff_size, out_size); } -ur_result_t urPrintBindlessImagesReleaseInteropExpParams( - const struct ur_bindless_images_release_interop_exp_params_t *params, +ur_result_t urPrintBindlessImagesMapExternalLinearMemoryExpParams( + const struct ur_bindless_images_map_external_linear_memory_exp_params_t + *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t urPrintBindlessImagesReleaseExternalMemoryExpParams( + const struct ur_bindless_images_release_external_memory_exp_params_t + *params, char *buffer, const size_t buff_size, size_t *out_size) { std::stringstream ss; ss << params; @@ -1469,6 +1487,22 @@ ur_result_t urPrintCommandBufferUpdateKernelLaunchExpParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintCommandBufferUpdateSignalEventExpParams( + const struct ur_command_buffer_update_signal_event_exp_params_t *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t urPrintCommandBufferUpdateWaitEventsExpParams( + const struct ur_command_buffer_update_wait_events_exp_params_t *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t 
urPrintCommandBufferGetInfoExpParams( const struct ur_command_buffer_get_info_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size) { diff --git a/source/mock/ur_mock_helpers.cpp b/source/mock/ur_mock_helpers.cpp index a8304492a6..4d04a96566 100644 --- a/source/mock/ur_mock_helpers.cpp +++ b/source/mock/ur_mock_helpers.cpp @@ -13,10 +13,8 @@ #include "ur_mock_helpers.hpp" namespace mock { +static callbacks_t callbacks = {}; -callbacks_t &getCallbacks() { - static callbacks_t callbacks; - return callbacks; -} +callbacks_t &getCallbacks() { return callbacks; } } // namespace mock diff --git a/source/ur/ur.cpp b/source/ur/ur.cpp index cff431069a..76e6bb6d31 100644 --- a/source/ur/ur.cpp +++ b/source/ur/ur.cpp @@ -14,10 +14,19 @@ // Controls tracing UR calls from within the UR itself. bool PrintTrace = [] { + const char *UrRet = std::getenv("SYCL_UR_TRACE"); const char *PiRet = std::getenv("SYCL_PI_TRACE"); - const char *Trace = PiRet ? PiRet : nullptr; - const int TraceValue = Trace ? std::stoi(Trace) : 0; - if (TraceValue == -1 || TraceValue == 2) { // Means print all traces + const char *Trace = UrRet ? UrRet : (PiRet ? PiRet : nullptr); + int TraceValue = 0; + if (Trace) { + try { + TraceValue = std::stoi(Trace); + } catch (...) { + // no-op, we don't have a logger yet to output an error. 
+ } + } + + if (TraceValue == -1 || TraceValue == 2) { return true; } return false; diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp index e8a1ed56b9..0639a9d9be 100644 --- a/source/ur/ur.hpp +++ b/source/ur/ur.hpp @@ -53,6 +53,10 @@ const ur_command_t UR_EXT_COMMAND_TYPE_USER = #define __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE \ "@reqd_work_group_size" #define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping" +#define __SYCL_UR_PROGRAM_METADATA_TAG_MAX_WORK_GROUP_SIZE \ + "@max_work_group_size" +#define __SYCL_UR_PROGRAM_METADATA_TAG_MAX_LINEAR_WORK_GROUP_SIZE \ + "@max_linear_work_group_size" #define __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION "Requires finalization" // Terminates the process with a catastrophic error message. diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 2e762b985b..57d2dc35ac 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -7,7 +7,7 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * * @file ur_api.cpp - * @version v0.10-r0 + * @version v0.11-r0 * */ #include "ur_api.h" @@ -143,7 +143,7 @@ ur_result_t UR_APICALL urLoaderConfigEnableLayer( hLoaderConfig, ///< [in] Handle to config object the layer will be enabled for. const char * pLayerName ///< [in] Null terminated string containing the name of the layer to - ///< enable. + ///< enable. Empty if none are enabled. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -278,6 +278,7 @@ ur_result_t UR_APICALL urLoaderTearDown(void) { /// - ::UR_RESULT_ERROR_DEVICE_LOST /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_SIZE +/// + `NumEntries == 0 && phAdapters != NULL` ur_result_t UR_APICALL urAdapterGet( uint32_t NumEntries, ///< [in] the number of adapters to be added to phAdapters. @@ -287,7 +288,7 @@ ur_result_t UR_APICALL urAdapterGet( ur_adapter_handle_t * phAdapters, ///< [out][optional][range(0, NumEntries)] array of handle of adapters. 
///< If NumEntries is less than the number of adapters available, then - ///< ::urAdapterGet shall only retrieve that number of platforms. + ///< ::urAdapterGet shall only retrieve that number of adapters. uint32_t * pNumAdapters ///< [out][optional] returns the total number of adapters available. ) { @@ -997,7 +998,7 @@ ur_result_t UR_APICALL urDeviceGetNativeHandle( /// - ::UR_RESULT_ERROR_DEVICE_LOST /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hPlatform` +/// + `NULL == hAdapter` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phDevice` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE @@ -1005,7 +1006,8 @@ ur_result_t UR_APICALL urDeviceGetNativeHandle( ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ///< [in][nocheck] the native handle of the device. - ur_platform_handle_t hPlatform, ///< [in] handle of the platform instance + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter to which `hNativeDevice` belongs const ur_device_native_properties_t * pProperties, ///< [in][optional] pointer to native device properties struct. 
ur_device_handle_t @@ -1305,8 +1307,7 @@ ur_result_t UR_APICALL urContextSetExtendedDeleter( /// /// @details /// - The primary ::ur_image_format_t that must be supported by all the -/// adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA, -/// UR_IMAGE_CHANNEL_TYPE_UNORM_INT8}, +/// adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT8}, /// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT16}, /// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT8}, /// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT16}, @@ -2981,6 +2982,11 @@ ur_result_t UR_APICALL urProgramGetBuildInfo( /// + `NULL == pSpecConstants` /// - ::UR_RESULT_ERROR_INVALID_SIZE /// + `count == 0` +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + A pSpecConstant entry contains a size that does not match that of the specialization constant in the module. +/// + A pSpecConstant entry contains a nullptr pValue. +/// - ::UR_RESULT_ERROR_INVALID_SPEC_ID +/// + Any id specified in a pSpecConstant entry is not a valid specialization constant identifier. ur_result_t UR_APICALL urProgramSetSpecializationConstants( ur_program_handle_t hProgram, ///< [in] handle of the Program object uint32_t count, ///< [in] the number of elements in the pSpecConstants array @@ -3113,6 +3119,7 @@ ur_result_t UR_APICALL urKernelSetArgValue( *pProperties, ///< [in][optional] pointer to value properties. const void *pArgValue ///< [in] argument value represented as matching arg type. + ///< The data pointed to will be copied and therefore can be reused on return. 
) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -3209,7 +3216,7 @@ ur_result_t UR_APICALL urKernelGetInfo( /// + `NULL == hKernel` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE < propName` +/// + `::UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE < propName` ur_result_t UR_APICALL urKernelGetGroupInfo( ur_kernel_handle_t hKernel, ///< [in] handle of the Kernel object ur_device_handle_t hDevice, ///< [in] handle of the Device object @@ -3470,6 +3477,11 @@ ur_result_t UR_APICALL urKernelSetArgMemObj( /// + `count == 0` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE /// + If ::UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS query is false +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + A pSpecConstant entry contains a size that does not match that of the specialization constant in the module. +/// + A pSpecConstant entry contains a nullptr pValue. +/// - ::UR_RESULT_ERROR_INVALID_SPEC_ID +/// + Any id specified in a pSpecConstant entry is not a valid specialization constant identifier. ur_result_t UR_APICALL urKernelSetSpecializationConstants( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object uint32_t count, ///< [in] the number of elements in the pSpecConstants array @@ -4182,17 +4194,16 @@ ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that ///< specify the number of local work-items forming a work-group that will ///< execute the kernel function. - ///< If nullptr, the runtime implementation will choose the work-group - ///< size. + ///< If nullptr, the runtime implementation will choose the work-group size. uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. 
- ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -4240,7 +4251,8 @@ ur_result_t UR_APICALL urEnqueueEventsWait( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -4290,7 +4302,8 @@ ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -4346,7 +4359,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferRead( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -4404,7 +4418,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferWrite( ///< command does not wait on any event to complete. 
ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -4482,7 +4497,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferReadRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -4563,7 +4579,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -4616,7 +4633,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferCopy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -4686,7 +4704,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. 
If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -4744,7 +4763,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferFill( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -4806,7 +4826,8 @@ ur_result_t UR_APICALL urEnqueueMemImageRead( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -4869,7 +4890,8 @@ ur_result_t UR_APICALL urEnqueueMemImageWrite( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -4927,7 +4949,8 @@ ur_result_t UR_APICALL urEnqueueMemImageCopy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -4989,7 +5012,8 @@ ur_result_t UR_APICALL urEnqueueMemBufferMap( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent, ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. void **ppRetMap ///< [out] return mapped pointer. TODO: move it before ///< numEventsInWaitList? ) { @@ -5039,7 +5063,8 @@ ur_result_t UR_APICALL urEnqueueMemUnmap( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -5092,7 +5117,8 @@ ur_result_t UR_APICALL urEnqueueUSMFill( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -5141,7 +5167,8 @@ ur_result_t UR_APICALL urEnqueueUSMMemcpy( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -5194,7 +5221,8 @@ ur_result_t UR_APICALL urEnqueueUSMPrefetch( ///< command does not wait on any event to complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -5293,11 +5321,11 @@ ur_result_t UR_APICALL urEnqueueUSMFill2D( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -5352,11 +5380,11 @@ ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. 
) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -5398,11 +5426,11 @@ ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -5444,11 +5472,11 @@ ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -5495,9 +5523,10 @@ ur_result_t UR_APICALL urEnqueueReadHostPipe( ///< events that must be complete before the host pipe read. ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. 
ur_event_handle_t * - phEvent ///< [out][optional] returns an event object that identifies this read - ///< command + phEvent ///< [out][optional] returns an event object that identifies this read command ///< and can be used to query or queue a wait for this command to complete. + ///< If phEventWaitList and phEvent are not NULL, phEvent must not refer to + ///< an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -5547,6 +5576,8 @@ ur_result_t UR_APICALL urEnqueueWriteHostPipe( ur_event_handle_t * phEvent ///< [out][optional] returns an event object that identifies this write command ///< and can be used to query or queue a wait for this command to complete. + ///< If phEventWaitList and phEvent are not NULL, phEvent must not refer to + ///< an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -5877,7 +5908,8 @@ ur_result_t UR_APICALL urBindlessImagesImageCopyExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -5993,8 +6025,8 @@ ur_result_t UR_APICALL urBindlessImagesMipmapFreeExp( /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// + `::UR_EXP_EXTERNAL_MEM_TYPE_WIN32_NT_DX12_RESOURCE < memHandleType` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pInteropMemDesc` -/// + `NULL == phInteropMem` +/// + `NULL == pExternalMemDesc` +/// + `NULL == phExternalMem` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT @@ -6004,17 +6036,17 @@ ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( size_t size, ///< [in] size of the external memory ur_exp_external_mem_type_t memHandleType, ///< [in] type of external memory handle - ur_exp_interop_mem_desc_t - *pInteropMemDesc, ///< [in] the interop memory descriptor - ur_exp_interop_mem_handle_t - *phInteropMem ///< [out] interop memory handle to the external memory + ur_exp_external_mem_desc_t + *pExternalMemDesc, ///< [in] the external memory descriptor + ur_exp_external_mem_handle_t + *phExternalMem ///< [out] external memory handle to the external memory ) { ur_result_t result = UR_RESULT_SUCCESS; return result; } /////////////////////////////////////////////////////////////////////////////// -/// @brief Map an interop memory handle to an image memory handle +/// @brief Map an external memory handle to an image memory handle /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -6024,7 +6056,7 @@ ur_result_t UR_APICALL urBindlessImagesImportExternalMemoryExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hContext` /// + `NULL == hDevice` -/// + `NULL == hInteropMem` +/// + `NULL == hExternalMem` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pImageFormat` /// + `NULL == pImageDesc` @@ -6042,8 +6074,8 @@ ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification const 
ur_image_desc_t *pImageDesc, ///< [in] pointer to image description - ur_exp_interop_mem_handle_t - hInteropMem, ///< [in] interop memory handle to the external memory + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory ur_exp_image_mem_native_handle_t * phImageMem ///< [out] image memory handle to the externally allocated memory ) { @@ -6052,7 +6084,40 @@ ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Release interop memory +/// @brief Map an external memory handle to a device memory region described by +/// void* +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hContext` +/// + `NULL == hDevice` +/// + `NULL == hExternalMem` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == ppRetMem` +/// - ::UR_RESULT_ERROR_INVALID_CONTEXT +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +ur_result_t UR_APICALL urBindlessImagesMapExternalLinearMemoryExp( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + uint64_t offset, ///< [in] offset into memory region to map + uint64_t size, ///< [in] size of memory region to map + ur_exp_external_mem_handle_t + hExternalMem, ///< [in] external memory handle to the external memory + void **ppRetMem ///< [out] pointer of the externally allocated memory +) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Release external memory /// /// @remarks /// _Analogues_ @@ -6066,14 +6131,14 @@ 
ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hContext` /// + `NULL == hDevice` -/// + `NULL == hInteropMem` +/// + `NULL == hExternalMem` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE -ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( +ur_result_t UR_APICALL urBindlessImagesReleaseExternalMemoryExp( ur_context_handle_t hContext, ///< [in] handle of the context object ur_device_handle_t hDevice, ///< [in] handle of the device object - ur_exp_interop_mem_handle_t - hInteropMem ///< [in][release] handle of interop memory to be destroyed + ur_exp_external_mem_handle_t + hExternalMem ///< [in][release] handle of external memory to be destroyed ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6097,8 +6162,8 @@ ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// + `::UR_EXP_EXTERNAL_SEMAPHORE_TYPE_WIN32_NT_DX12_FENCE < semHandleType` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pInteropSemaphoreDesc` -/// + `NULL == phInteropSemaphore` +/// + `NULL == pExternalSemaphoreDesc` +/// + `NULL == phExternalSemaphore` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( @@ -6106,10 +6171,10 @@ ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( ur_device_handle_t hDevice, ///< [in] handle of the device object ur_exp_external_semaphore_type_t semHandleType, ///< [in] type of external memory handle - ur_exp_interop_semaphore_desc_t - *pInteropSemaphoreDesc, ///< [in] the interop semaphore descriptor - ur_exp_interop_semaphore_handle_t * - phInteropSemaphore ///< [out] interop semaphore handle to the external semaphore + ur_exp_external_semaphore_desc_t + *pExternalSemaphoreDesc, ///< [in] the external semaphore descriptor + ur_exp_external_semaphore_handle_t * + phExternalSemaphore 
///< [out] external semaphore handle to the external semaphore ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6130,14 +6195,14 @@ ur_result_t UR_APICALL urBindlessImagesImportExternalSemaphoreExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hContext` /// + `NULL == hDevice` -/// + `NULL == hInteropSemaphore` +/// + `NULL == hExternalSemaphore` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( ur_context_handle_t hContext, ///< [in] handle of the context object ur_device_handle_t hDevice, ///< [in] handle of the device object - ur_exp_interop_semaphore_handle_t - hInteropSemaphore ///< [in][release] handle of interop semaphore to be destroyed + ur_exp_external_semaphore_handle_t + hExternalSemaphore ///< [in][release] handle of external semaphore to be destroyed ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6162,8 +6227,8 @@ ur_result_t UR_APICALL urBindlessImagesReleaseExternalSemaphoreExp( /// - ::UR_RESULT_ERROR_INVALID_VALUE ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( ur_queue_handle_t hQueue, ///< [in] handle of the queue object - ur_exp_interop_semaphore_handle_t - hSemaphore, ///< [in] interop semaphore handle + ur_exp_external_semaphore_handle_t + hSemaphore, ///< [in] external semaphore handle bool hasWaitValue, ///< [in] indicates whether the samephore is capable and should wait on a ///< certain value. @@ -6179,7 +6244,8 @@ ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6205,8 +6271,8 @@ ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( /// - ::UR_RESULT_ERROR_INVALID_VALUE ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( ur_queue_handle_t hQueue, ///< [in] handle of the queue object - ur_exp_interop_semaphore_handle_t - hSemaphore, ///< [in] interop semaphore handle + ur_exp_external_semaphore_handle_t + hSemaphore, ///< [in] external semaphore handle bool hasSignalValue, ///< [in] indicates whether the samephore is capable and should signal on a ///< certain value. @@ -6222,7 +6288,8 @@ ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( ///< must be complete. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command instance. + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6346,12 +6413,23 @@ ur_result_t UR_APICALL urCommandBufferFinalizeExp( /// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION /// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE /// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + `phKernelAlternatives == NULL && numKernelAlternatives > 0` +/// + `phKernelAlternatives != NULL && numKernelAlternatives == 0` +/// + If `phKernelAlternatives` contains `hKernel` /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. 
+/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// - ::UR_RESULT_ERROR_INVALID_OPERATION - "phCommand is not NULL and hCommandBuffer is not updatable." ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t hCommandBuffer, ///< [in] Handle of the command-buffer object. @@ -6362,16 +6440,37 @@ ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( const size_t * pGlobalWorkSize, ///< [in] Global work size to use when executing kernel. const size_t * - pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. + pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. If this + ///< parameter is nullptr, then a local work size will be generated by the + ///< implementation. + uint32_t + numKernelAlternatives, ///< [in] The number of kernel alternatives provided in + ///< phKernelAlternatives. + ur_kernel_handle_t * + phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernel handles + ///< that might be used to update the kernel in this + ///< command after the command-buffer is finalized. The default kernel + ///< `hKernel` is implicitly marked as an alternative. It's + ///< invalid to specify it as part of this list. uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. 
+ const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * pSyncPoint, ///< [out][optional] Sync point associated with this command. - ur_exp_command_buffer_command_handle_t - *phCommand ///< [out][optional] Handle to this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t * + phCommand ///< [out][optional] Handle to this command. Only available if the + ///< command-buffer is updatable. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6399,6 +6498,13 @@ ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. 
/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( @@ -6412,8 +6518,19 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6443,6 +6560,13 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. 
+/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( @@ -6458,8 +6582,19 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. 
) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6483,6 +6618,13 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( @@ -6498,8 +6640,19 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. 
+ ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6524,6 +6677,13 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( @@ -6539,8 +6699,19 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. 
+ ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6565,6 +6736,13 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( @@ -6579,8 +6757,19 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. 
+ pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6604,6 +6793,13 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( @@ -6626,8 +6822,19 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. 
ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6652,6 +6859,13 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( @@ -6680,8 +6894,19 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. 
If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6706,6 +6931,13 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` /// + `pSyncPointWaitList != NULL && numSyncPointsInWaitList == 0` /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( @@ -6732,8 +6964,19 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. 
+ const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] Sync point associated with this command. + pSyncPoint, ///< [out][optional] Sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6760,6 +7003,13 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT /// - ::UR_RESULT_ERROR_INVALID_SIZE /// + If `offset + size` results in an out-of-bounds access. +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( @@ -6776,8 +7026,19 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. 
+ uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6811,6 +7072,13 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( /// - ::UR_RESULT_ERROR_INVALID_SIZE /// + `size == 0` /// + If `size` is higher than the allocation size of `pMemory` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( @@ -6824,8 +7092,19 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. 
+ uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6859,6 +7138,13 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( /// - ::UR_RESULT_ERROR_INVALID_SIZE /// + `size == 0` /// + If `size` is higher than the allocation size of `pMemory` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If the device associated with `hCommandBuffer` does not support UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP and either `phEvent` or `phEventWaitList` are not NULL. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( @@ -6872,8 +7158,19 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( const ur_exp_command_buffer_sync_point_t * pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. 
+ uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. ur_exp_command_buffer_sync_point_t * - pSyncPoint ///< [out][optional] sync point associated with this command. + pSyncPoint, ///< [out][optional] sync point associated with this command. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that will be signaled by the + ///< completion of this command in the next execution of the + ///< command-buffer. + ur_exp_command_buffer_command_handle_t + *phCommand ///< [out][optional] Handle to this command. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6911,7 +7208,8 @@ ur_result_t UR_APICALL urCommandBufferEnqueueExp( ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< command-buffer execution instance. + ///< command-buffer execution instance. If phEventWaitList and phEvent are + ///< not NULL, phEvent must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -6961,9 +7259,10 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Update a kernel launch command in a finalized command-buffer. This -/// entry-point is synchronous and may block if the command-buffer is -/// executing when the entry-point is called. +/// @brief Update a kernel launch command in a finalized command-buffer. +/// +/// @details +/// This entry-point is synchronous and may block if the command-buffer is executing when the entry-point is called. 
/// /// @returns /// - ::UR_RESULT_SUCCESS @@ -6979,18 +7278,17 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( /// - ::UR_RESULT_ERROR_INVALID_OPERATION /// + If ::ur_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to. /// + If the command-buffer `hCommand` belongs to has not been finalized. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and different from the work-dim used on creation of `hCommand`. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value and `pUpdateKernelLaunch->pNewGlobalWorkSize` is NULL. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value when `hCommand` was created with a NULL local work size. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a NULL value when `hCommand` was created with a non-NULL local work size. -/// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP +/// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP - "If `hCommand` is not a kernel execution command." /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT /// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX /// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// + `pUpdateKernelLaunch->newWorkDim < 1 || pUpdateKernelLaunch->newWorkDim > 3` /// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE /// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + If `pUpdateKernelLaunch->hNewKernel` was not passed to the `hKernel` or `phKernelAlternatives` parameters of ::urCommandBufferAppendKernelLaunchExp when this command was created. 
+/// + If `pUpdateKernelLaunch->newWorkDim` is different from the current workDim in `hCommand` and, pUpdateKernelLaunch->pNewGlobalWorkSize, or pUpdateKernelLaunch->pNewGlobalWorkOffset are nullptr. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( @@ -7003,6 +7301,78 @@ ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Get a new event that will be signaled the next time the command in the +/// command-buffer executes. +/// +/// @details +/// It is the users responsibility to release the returned `phSignalEvent`. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hCommand` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == phSignalEvent` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS is not supported by the device associated with `hCommand`. +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// + If ::ur_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to. +/// + If the command-buffer `hCommand` belongs to has not been finalized. +/// + If no `phEvent` parameter was set on creation of the command associated with `hCommand`. +/// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +ur_result_t UR_APICALL urCommandBufferUpdateSignalEventExp( + ur_exp_command_buffer_command_handle_t + hCommand, ///< [in] Handle of the command-buffer command to update. + ur_event_handle_t *phSignalEvent ///< [out] Event to be signaled. 
+) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Set the list of wait events for a command to depend on to a list of +/// new events. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hCommand` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS is not supported by the device associated with `hCommand`. +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// + If ::ur_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to. +/// + If the command-buffer `hCommand` belongs to has not been finalized. +/// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// + If `numEventsInWaitList` does not match the number of wait events set when the command associated with `hCommand` was created. +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +ur_result_t UR_APICALL urCommandBufferUpdateWaitEventsExp( + ur_exp_command_buffer_command_handle_t + hCommand, ///< [in] Handle of the command-buffer command to update. + uint32_t numEventsInWaitList, ///< [in] Size of the event wait list. + const ur_event_handle_t * + phEventWaitList ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the command execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating no wait events. 
+) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Get command-buffer object information. /// @@ -7014,7 +7384,7 @@ ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_EXP_COMMAND_BUFFER_INFO_REFERENCE_COUNT < propName` +/// + `::UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -7126,17 +7496,16 @@ ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that ///< specify the number of local work-items forming a work-group that will ///< execute the kernel function. - ///< If nullptr, the runtime implementation will choose the work-group - ///< size. + ///< If nullptr, the runtime implementation will choose the work-group size. uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait - ///< event. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. 
) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -7194,8 +7563,7 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait - ///< events. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies this particular kernel ///< execution instance. Profiling information can be queried @@ -7203,7 +7571,9 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` ///< reports the timestamp at the time of the call to this function. ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` - ///< reports the timestamp recorded when the command is executed on the device. + ///< reports the timestamp recorded when the command is executed on the + ///< device. If phEventWaitList and phEvent are not NULL, phEvent must not + ///< refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -7282,7 +7652,9 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( ///< the numEventsInWaitList must be 0, indicating that no wait event. ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies this particular - ///< kernel execution instance. + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList + ///< array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; @@ -7645,7 +8017,8 @@ ur_result_t UR_APICALL urEnqueueNativeCommandExp( ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. 
ur_event_handle_t * phEvent ///< [out][optional] return an event object that identifies the work that has - ///< been enqueued in nativeEnqueueFunc. + ///< been enqueued in nativeEnqueueFunc. If phEventWaitList and phEvent are + ///< not NULL, phEvent must not refer to an element of the phEventWaitList array. ) { ur_result_t result = UR_RESULT_SUCCESS; return result; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e648bac44a..e7514cefd8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -12,8 +12,10 @@ FetchContent_Declare( set(UR_TEST_DEVICES_COUNT 1 CACHE STRING "Count of devices on which conformance and adapters tests will be run") set(UR_TEST_PLATFORMS_COUNT 1 CACHE STRING "Count of platforms on which conformance and adapters tests will be run") +set(UR_TEST_FUZZTESTS ON CACHE BOOL "Run fuzz tests if using clang and UR_DPCXX is specified") # For Windows: Prevent overriding the parent project's compiler/linker settings set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) +set(INSTALL_GTEST OFF) FetchContent_MakeAvailable(googletest) enable_testing() @@ -30,6 +32,6 @@ add_subdirectory(mock) if(UR_BUILD_TOOLS) add_subdirectory(tools) endif() -if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND UR_DPCXX) +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND UR_DPCXX AND UR_TEST_FUZZTESTS) add_subdirectory(fuzz) endif() diff --git a/test/adapters/CMakeLists.txt b/test/adapters/CMakeLists.txt index 969d8a4e86..bbe6826b09 100644 --- a/test/adapters/CMakeLists.txt +++ b/test/adapters/CMakeLists.txt @@ -63,11 +63,10 @@ function(add_adapter_memcheck_test name) add_test(NAME ${test_name} COMMAND ${CMAKE_COMMAND} -D TEST_FILE=valgrind - -D TEST_ARGS="--tool=memcheck --leak-check=full $ --devices_count=${UR_TEST_DEVICES_COUNT} --platforms_count=${UR_TEST_DEVICES_COUNT}" + -D TEST_ARGS="--tool=memcheck --leak-check=full $ --backend=${backend} --devices_count=${UR_TEST_DEVICES_COUNT} --platforms_count=${UR_TEST_DEVICES_COUNT}" -D MODE=stderr -D 
MATCH_FILE=${CMAKE_CURRENT_SOURCE_DIR}/${name}_memcheck.match -P ${PROJECT_SOURCE_DIR}/cmake/match.cmake - DEPENDS ${TEST_TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) @@ -85,6 +84,6 @@ if(UR_BUILD_ADAPTER_HIP OR UR_BUILD_ADAPTER_ALL) add_subdirectory(hip) endif() -if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_ALL) +if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_L0_V2 OR UR_BUILD_ADAPTER_ALL) add_subdirectory(level_zero) endif() diff --git a/test/adapters/cuda/CMakeLists.txt b/test/adapters/cuda/CMakeLists.txt index 66c1fa4b1e..3f2f0c270c 100644 --- a/test/adapters/cuda/CMakeLists.txt +++ b/test/adapters/cuda/CMakeLists.txt @@ -16,6 +16,7 @@ add_adapter_test(cuda urQueueGetNativeHandle.cpp kernel_tests.cpp memory_tests.cpp + event_tests.cpp #FIXME: make this cleaner ${CMAKE_CURRENT_SOURCE_DIR}/../../../source/adapters/cuda/queue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../../source/adapters/cuda/common.cpp @@ -28,4 +29,4 @@ target_include_directories(test-adapter-cuda PRIVATE ${PROJECT_SOURCE_DIR}/source/adapters/cuda ) -target_link_libraries(test-adapter-cuda PRIVATE cudadrv) +target_link_libraries(test-adapter-cuda PRIVATE cudadrv ${PROJECT_NAME}::umf) diff --git a/test/adapters/cuda/event_tests.cpp b/test/adapters/cuda/event_tests.cpp new file mode 100644 index 0000000000..13ffea858f --- /dev/null +++ b/test/adapters/cuda/event_tests.cpp @@ -0,0 +1,36 @@ +// Copyright (C) 2022-2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "device.hpp" +#include "event.hpp" +#include "fixtures.h" +#include "raii.h" + +using cudaEventTest = uur::urContextTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(cudaEventTest); + +// Testing the urEventGetInfo behaviour for natively constructed (Cuda) events. +// Backend interop APIs can lead to creating event objects that are not fully +// initialized. 
In the Cuda adapter, an event can have nullptr command queue +// because the interop API does not associate a UR-owned queue with the event. +TEST_P(cudaEventTest, GetQueueFromEventCreatedWithNativeHandle) { + CUcontext cuda_ctx = device->getNativeContext(); + EXPECT_NE(cuda_ctx, nullptr); + RAIICUevent cuda_event; + ASSERT_SUCCESS_CUDA(cuCtxSetCurrent(cuda_ctx)); + ASSERT_SUCCESS_CUDA(cuEventCreate(cuda_event.ptr(), CU_EVENT_DEFAULT)); + + auto native_event = reinterpret_cast(cuda_event.get()); + uur::raii::Event event{nullptr}; + ASSERT_SUCCESS(urEventCreateWithNativeHandle(native_event, context, nullptr, + event.ptr())); + EXPECT_NE(event, nullptr); + + size_t ret_size{}; + ur_queue_handle_t q{}; + ASSERT_EQ_RESULT(urEventGetInfo(event, UR_EVENT_INFO_COMMAND_QUEUE, + sizeof(ur_queue_handle_t), &q, &ret_size), + UR_RESULT_ERROR_ADAPTER_SPECIFIC); +} diff --git a/test/adapters/cuda/raii.h b/test/adapters/cuda/raii.h new file mode 100644 index 0000000000..e401082e48 --- /dev/null +++ b/test/adapters/cuda/raii.h @@ -0,0 +1,25 @@ +// Copyright (C) 2022-2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef UR_TEST_CONFORMANCE_ADAPTERS_CUDA_RAII_H_INCLUDED +#define UR_TEST_CONFORMANCE_ADAPTERS_CUDA_RAII_H_INCLUDED + +#include "uur/raii.h" +#include + +struct RAIICUevent { + CUevent handle = nullptr; + + ~RAIICUevent() { + if (handle) { + cuEventDestroy(handle); + } + } + + CUevent *ptr() { return &handle; } + CUevent get() { return handle; } +}; + +#endif // UR_TEST_CONFORMANCE_ADAPTERS_CUDA_RAII_H_INCLUDED diff --git a/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp b/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp index 6eb502907b..e4ac022507 100644 --- a/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp +++ b/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp @@ -17,6 +17,6 @@ TEST_F(urCudaDeviceCreateWithNativeHandle, Success) { ur_native_handle_t nativeCuda = static_cast(cudaDevice); ur_device_handle_t urDevice; - ASSERT_SUCCESS(urDeviceCreateWithNativeHandle(nativeCuda, platform, nullptr, + ASSERT_SUCCESS(urDeviceCreateWithNativeHandle(nativeCuda, adapter, nullptr, &urDevice)); } diff --git a/test/adapters/cuda/urEventCreateWithNativeHandle.cpp b/test/adapters/cuda/urEventCreateWithNativeHandle.cpp index 68a99bba4b..6079eb194c 100644 --- a/test/adapters/cuda/urEventCreateWithNativeHandle.cpp +++ b/test/adapters/cuda/urEventCreateWithNativeHandle.cpp @@ -4,24 +4,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "fixtures.h" -#include "uur/raii.h" +#include "raii.h" using urCudaEventCreateWithNativeHandleTest = uur::urQueueTest; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCudaEventCreateWithNativeHandleTest); -struct RAIICUevent { - CUevent handle = nullptr; - - ~RAIICUevent() { - if (handle) { - cuEventDestroy(handle); - } - } - - CUevent *ptr() { return &handle; } - CUevent get() { return handle; } -}; - TEST_P(urCudaEventCreateWithNativeHandleTest, Success) { RAIICUevent cuda_event; ASSERT_SUCCESS_CUDA(cuEventCreate(cuda_event.ptr(), 
CU_EVENT_DEFAULT)); diff --git a/test/adapters/hip/CMakeLists.txt b/test/adapters/hip/CMakeLists.txt index 3496f71bd0..b17d00003f 100644 --- a/test/adapters/hip/CMakeLists.txt +++ b/test/adapters/hip/CMakeLists.txt @@ -11,6 +11,7 @@ add_adapter_test(hip urDeviceGetNativeHandle.cpp urEventGetNativeHandle.cpp test_context.cpp + test_event.cpp ENVIRONMENT "UR_ADAPTERS_FORCE_LOAD=\"$\"" ) @@ -26,4 +27,4 @@ target_compile_definitions(test-adapter-hip PRIVATE ${HIP_COMPILE_DEFINITIONS} ) -target_link_libraries(test-adapter-hip PRIVATE rocmdrv) +target_link_libraries(test-adapter-hip PRIVATE rocmdrv ${PROJECT_NAME}::umf) diff --git a/test/adapters/hip/test_context.cpp b/test/adapters/hip/test_context.cpp index c8dd7ac315..3b384dcbcf 100644 --- a/test/adapters/hip/test_context.cpp +++ b/test/adapters/hip/test_context.cpp @@ -24,14 +24,6 @@ TEST_P(urHipContextTest, ActiveContexts) { // ensure that the queue has the correct context ASSERT_EQ(context, queue->getContext()); - - // check that the current context is the active HIP context - hipCtx_t hipContext = nullptr; - ASSERT_SUCCESS_HIP(hipCtxGetCurrent(&hipContext)); - ASSERT_NE(hipContext, nullptr); - if (context->getDevices().size() == 1) { - ASSERT_EQ(hipContext, context->getDevices()[0]->getNativeContext()); - } } TEST_P(urHipContextTest, ActiveContextsThreads) { @@ -50,7 +42,6 @@ TEST_P(urHipContextTest, ActiveContextsThreads) { bool thread_done = false; auto test_thread = std::thread([&] { - hipCtx_t current = nullptr; { uur::raii::Queue queue = nullptr; ASSERT_SUCCESS( @@ -59,13 +50,6 @@ TEST_P(urHipContextTest, ActiveContextsThreads) { // ensure queue has the correct context ASSERT_EQ(queue->getContext(), context1); - - // check that the first context is now the active HIP context - ASSERT_SUCCESS_HIP(hipCtxGetCurrent(&current)); - if (context1->getDevices().size() == 1) { - ASSERT_EQ(current, - context1->getDevices()[0]->getNativeContext()); - } } // mark the first set of processing as done and notify the main thread @@
-90,13 +74,6 @@ TEST_P(urHipContextTest, ActiveContextsThreads) { // ensure the queue has the correct context ASSERT_EQ(queue->getContext(), context2); - - // check that the second context is now the active HIP context - ASSERT_SUCCESS_HIP(hipCtxGetCurrent(&current)); - if (context2->getDevices().size() == 1) { - ASSERT_EQ(current, - context2->getDevices()[0]->getNativeContext()); - } } }); diff --git a/test/adapters/hip/test_event.cpp b/test/adapters/hip/test_event.cpp new file mode 100644 index 0000000000..412b926314 --- /dev/null +++ b/test/adapters/hip/test_event.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2022-2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "event.hpp" +#include "fixtures.h" +#include "uur/raii.h" + +#include +#include + +struct RAIIHipEvent { + hipEvent_t handle = nullptr; + + ~RAIIHipEvent() { + if (handle) { + std::ignore = hipEventDestroy(handle); + } + } + + hipEvent_t *ptr() { return &handle; } + hipEvent_t get() { return handle; } +}; + +using urHipEventTest = uur::urContextTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urHipEventTest); + +// Testing the urEventGetInfo behaviour for natively constructed (HIP) events. +// Backend interop APIs can lead to creating event objects that are not fully +// initialized. In the HIP adapter, an event can have nullptr command queue +// because the interop API does not associate a UR-owned queue with the event.
+TEST_P(urHipEventTest, GetQueueFromEventCreatedWithNativeHandle) { + RAIIHipEvent hip_event; + ASSERT_SUCCESS_HIP( + hipEventCreateWithFlags(hip_event.ptr(), hipEventDefault)); + + auto native_event = reinterpret_cast(hip_event.get()); + uur::raii::Event event{nullptr}; + ASSERT_SUCCESS(urEventCreateWithNativeHandle(native_event, context, nullptr, + event.ptr())); + EXPECT_NE(event, nullptr); + + size_t ret_size{}; + ur_queue_handle_t q{}; + ASSERT_EQ_RESULT(urEventGetInfo(event, UR_EVENT_INFO_COMMAND_QUEUE, + sizeof(ur_queue_handle_t), &q, &ret_size), + UR_RESULT_ERROR_ADAPTER_SPECIFIC); +} diff --git a/test/adapters/hip/urContextGetNativeHandle.cpp b/test/adapters/hip/urContextGetNativeHandle.cpp index 738c75ce95..4d1ec4df2c 100644 --- a/test/adapters/hip/urContextGetNativeHandle.cpp +++ b/test/adapters/hip/urContextGetNativeHandle.cpp @@ -10,7 +10,6 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urHipContextGetNativeHandleTest); TEST_P(urHipContextGetNativeHandleTest, Success) { ur_native_handle_t native_context = 0; - ASSERT_SUCCESS(urContextGetNativeHandle(context, &native_context)); - hipCtx_t hip_context = reinterpret_cast(native_context); - std::ignore = hip_context; + auto status = urContextGetNativeHandle(context, &native_context); + ASSERT_EQ(status, UR_RESULT_ERROR_UNSUPPORTED_FEATURE); } diff --git a/test/adapters/level_zero/CMakeLists.txt b/test/adapters/level_zero/CMakeLists.txt index 237a2bbe9b..bfb02d37c2 100644 --- a/test/adapters/level_zero/CMakeLists.txt +++ b/test/adapters/level_zero/CMakeLists.txt @@ -3,70 +3,89 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -if(NOT UR_DPCXX) - # Tests that require kernels can't be used if we aren't generating - # device binaries - message(WARNING - "UR_DPCXX is not defined, skipping some adapter tests for level_zero") -else() - add_adapter_test(level_zero - FIXTURE KERNELS - SOURCES - urProgramLink.cpp - urKernelCreateWithNativeHandle.cpp - ENVIRONMENT - "UR_ADAPTERS_FORCE_LOAD=\"$\"" 
- ) - # TODO: valgrind tests require very new environment. - # Enable once all L0 runners are updated. - # add_adapter_memcheck_test(level_zero - # ENVIRONMENT - # "UR_ADAPTERS_FORCE_LOAD=\"$\"" - # ) +if(UR_BUILD_ADAPTER_L0) + if(NOT UR_DPCXX) + # Tests that require kernels can't be used if we aren't generating + # device binaries + message(WARNING + "UR_DPCXX is not defined, skipping some adapter tests for level_zero") + else() + add_adapter_test(level_zero + FIXTURE KERNELS + SOURCES + urProgramLink.cpp + urKernelCreateWithNativeHandle.cpp + urEventCreateWithNativeHandle.cpp + ENVIRONMENT + "UR_ADAPTERS_FORCE_LOAD=\"$\"" + ) + # TODO: valgrind tests require very new environment. + # Enable once all L0 runners are updated. + # add_adapter_memcheck_test(level_zero + # ENVIRONMENT + # "UR_ADAPTERS_FORCE_LOAD=\"$\"" + # ) - target_link_libraries(test-adapter-level_zero PRIVATE - LevelZeroLoader - LevelZeroLoader-Headers - ) + target_link_libraries(test-adapter-level_zero PRIVATE + LevelZeroLoader + LevelZeroLoader-Headers + ) - target_include_directories(test-adapter-level_zero PRIVATE - ${PROJECT_SOURCE_DIR}/source - ${PROJECT_SOURCE_DIR}/source/adapters/level_zero - LevelZeroLoader-Headers - ) + target_include_directories(test-adapter-level_zero PRIVATE + ${PROJECT_SOURCE_DIR}/source + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero + LevelZeroLoader-Headers + ) - add_dependencies(test-adapter-level_zero - generate_device_binaries kernel_names_header) -endif() + add_dependencies(test-adapter-level_zero + generate_device_binaries kernel_names_header) + endif() -if(NOT WIN32) - # Make L0 use CallMap from a seprate shared lib so that we can access the map - # from the tests. 
This only seems to work on linux - add_library(zeCallMap SHARED zeCallMap.cpp) - target_compile_definitions(ur_adapter_level_zero PRIVATE UR_L0_CALL_COUNT_IN_TESTS) - target_link_libraries(ur_adapter_level_zero PRIVATE zeCallMap) + if(NOT WIN32 AND NOT UR_STATIC_ADAPTER_L0) + # Make L0 use CallMap from a seprate shared lib so that we can access the map + # from the tests. This only seems to work on linux + add_library(zeCallMap SHARED zeCallMap.cpp) + install_ur_library(zeCallMap) + target_compile_definitions(ur_adapter_level_zero PRIVATE UR_L0_CALL_COUNT_IN_TESTS) + # TODO: stop exporting internals like this for tests... + target_link_libraries(ur_adapter_level_zero PRIVATE zeCallMap) - add_adapter_test(level_zero_ze_calls - FIXTURE DEVICES - SOURCES - event_cache_tests.cpp - ENVIRONMENT - "UR_ADAPTERS_FORCE_LOAD=\"$\"" - "UR_L0_LEAKS_DEBUG=1" - ) + add_adapter_test(level_zero_ze_calls + FIXTURE DEVICES + SOURCES + event_cache_tests.cpp + ENVIRONMENT + "UR_ADAPTERS_FORCE_LOAD=\"$\"" + "UR_L0_LEAKS_DEBUG=1" + ) - target_link_libraries(test-adapter-level_zero_ze_calls PRIVATE zeCallMap) + target_link_libraries(test-adapter-level_zero_ze_calls PRIVATE zeCallMap) - add_adapter_test(level_zero_multi_queue + add_adapter_test(level_zero_multi_queue + FIXTURE DEVICES + SOURCES + multi_device_event_cache_tests.cpp + ENVIRONMENT + "UR_ADAPTERS_FORCE_LOAD=\"$\"" + "UR_L0_LEAKS_DEBUG=1" + ) + + target_link_libraries(test-adapter-level_zero_multi_queue PRIVATE zeCallMap) + endif() + + add_adapter_test(level_zero_ipc FIXTURE DEVICES SOURCES - multi_device_event_cache_tests.cpp + ipc.cpp ENVIRONMENT "UR_ADAPTERS_FORCE_LOAD=\"$\"" - "UR_L0_LEAKS_DEBUG=1" ) - target_link_libraries(test-adapter-level_zero_multi_queue PRIVATE zeCallMap) + target_link_libraries(test-adapter-level_zero_ipc PRIVATE + ur_umf + ) endif() -add_subdirectory(v2) +if(UR_BUILD_ADAPTER_L0_V2) + add_subdirectory(v2) +endif() diff --git a/test/adapters/level_zero/ipc.cpp b/test/adapters/level_zero/ipc.cpp new 
file mode 100644 index 0000000000..58ea56ad7c --- /dev/null +++ b/test/adapters/level_zero/ipc.cpp @@ -0,0 +1,51 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include + +#ifndef ASSERT_UMF_SUCCESS +#define ASSERT_UMF_SUCCESS(ACTUAL) ASSERT_EQ(ACTUAL, UMF_RESULT_SUCCESS) +#endif + +using urL0IpcTest = uur::urContextTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urL0IpcTest); + +TEST_P(urL0IpcTest, SuccessHostL0Ipc) { + ur_device_usm_access_capability_flags_t hostUSMSupport = 0; + ASSERT_SUCCESS(uur::GetDeviceUSMHostSupport(device, hostUSMSupport)); + if (!hostUSMSupport) { + GTEST_SKIP() << "Host USM is not supported."; + } + + void *ptr = nullptr; + size_t allocSize = sizeof(int); + ASSERT_SUCCESS(urUSMHostAlloc(context, nullptr, nullptr, allocSize, &ptr)); + ASSERT_NE(ptr, nullptr); + + umf_memory_pool_handle_t umfPool = umfPoolByPtr(ptr); + ASSERT_NE(umfPool, nullptr); + + umf_memory_provider_handle_t umfProvider = nullptr; + ASSERT_UMF_SUCCESS(umfPoolGetMemoryProvider(umfPool, &umfProvider)); + + size_t ipcHandleSize = 0; + ASSERT_UMF_SUCCESS( + umfMemoryProviderGetIPCHandleSize(umfProvider, &ipcHandleSize)); + + void *ipcHandle = nullptr; + ASSERT_UMF_SUCCESS( + umfMemoryProviderAlloc(umfProvider, ipcHandleSize, 0, &ipcHandle)); + ASSERT_UMF_SUCCESS( + umfMemoryProviderGetIPCHandle(umfProvider, ptr, allocSize, ipcHandle)); + + ASSERT_UMF_SUCCESS(umfMemoryProviderPutIPCHandle(umfProvider, ipcHandle)); + + ASSERT_UMF_SUCCESS( + umfMemoryProviderFree(umfProvider, ipcHandle, ipcHandleSize)); + + ASSERT_SUCCESS(urUSMFree(context, ptr)); +} diff --git a/test/adapters/level_zero/urEventCreateWithNativeHandle.cpp b/test/adapters/level_zero/urEventCreateWithNativeHandle.cpp new file mode 100644 index 0000000000..7e667bfe30 --- /dev/null +++ 
b/test/adapters/level_zero/urEventCreateWithNativeHandle.cpp @@ -0,0 +1,109 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "ur_api.h" +#include "uur/checks.h" +#include "ze_api.h" +#include +#include +#include + +using namespace std::chrono_literals; +using urLevelZeroEventNativeHandleTest = uur::urQueueTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urLevelZeroEventNativeHandleTest); + +#define TEST_MEMCPY_SIZE 4096 + +TEST_P(urLevelZeroEventNativeHandleTest, WaitForNative) { + ze_event_pool_desc_t desc; + desc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC; + desc.pNext = nullptr; + desc.count = 1; + desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + + ur_native_handle_t nativeContext; + ASSERT_SUCCESS(urContextGetNativeHandle(context, &nativeContext)); + + ur_native_handle_t nativeDevice; + ASSERT_SUCCESS(urDeviceGetNativeHandle(device, &nativeDevice)); + + ze_event_pool_handle_t pool = nullptr; + + ASSERT_EQ(zeEventPoolCreate((ze_context_handle_t)nativeContext, &desc, 1, + (ze_device_handle_t *)&nativeDevice, &pool), + ZE_RESULT_SUCCESS); + + ze_event_desc_t eventDesc; + eventDesc.pNext = nullptr; + eventDesc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + eventDesc.wait = 0; + + ze_event_handle_t zeEvent; + ASSERT_EQ(zeEventCreate(pool, &eventDesc, &zeEvent), ZE_RESULT_SUCCESS); + + ur_event_native_properties_t pprops; + pprops.isNativeHandleOwned = false; + pprops.pNext = nullptr; + pprops.stype = UR_STRUCTURE_TYPE_EVENT_NATIVE_PROPERTIES; + + ur_event_handle_t urEvent; + ASSERT_SUCCESS(urEventCreateWithNativeHandle((ur_native_handle_t)zeEvent, + context, &pprops, &urEvent)); + + int *src = (int *)malloc(TEST_MEMCPY_SIZE); + memset(src, 0xc, TEST_MEMCPY_SIZE); + + int *dst = (int *)malloc(TEST_MEMCPY_SIZE); + memset(dst, 0, 
TEST_MEMCPY_SIZE); + + int *dst2 = (int *)malloc(TEST_MEMCPY_SIZE); + memset(dst, 0, TEST_MEMCPY_SIZE); + + ur_event_handle_t memcpyEvent2; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, dst2, src, TEST_MEMCPY_SIZE, + 0, nullptr, &memcpyEvent2)); + + ur_event_handle_t memcpyEvent3; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, dst2, src, TEST_MEMCPY_SIZE, + 0, nullptr, &memcpyEvent3)); + + // just to make wait lists contain more than 1 event + ur_event_handle_t events[] = {memcpyEvent2, urEvent, memcpyEvent3}; + + ur_event_handle_t waitEvent; + ASSERT_SUCCESS( + urEnqueueEventsWaitWithBarrier(queue, 3, events, &waitEvent)); + + ur_event_handle_t memcpyEvent; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, dst, src, TEST_MEMCPY_SIZE, + 1, &waitEvent, &memcpyEvent)); + + // urQueueFinish would hang, so we flush and then wait + // some time to make sure the gpu had plenty of time + // to do the memcpy. + urQueueFlush(queue); + std::this_thread::sleep_for(500ms); + + ASSERT_NE(memcmp(src, dst, TEST_MEMCPY_SIZE), 0); + + zeEventHostSignal(zeEvent); + + urQueueFinish(queue); + + ASSERT_EQ(memcmp(src, dst, 4096), 0); + + free(src); + free(dst); + free(dst2); + urEventRelease(urEvent); + urEventRelease(waitEvent); + urEventRelease(memcpyEvent); + urEventRelease(memcpyEvent2); + urEventRelease(memcpyEvent3); + zeEventDestroy(zeEvent); + zeEventPoolDestroy(pool); +} diff --git a/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp b/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp index 6ee49dbbfb..b3918c7818 100644 --- a/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp +++ b/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp @@ -24,7 +24,8 @@ TEST_P(urLevelZeroKernelNativeHandleTest, OwnedHandleRelease) { auto kernel_name = uur::KernelsEnvironment::instance->GetEntryPointNames("foo")[0]; - ze_module_desc_t moduleDesc = {ZE_STRUCTURE_TYPE_MODULE_DESC}; + ze_module_desc_t moduleDesc{}; + moduleDesc.stype = 
ZE_STRUCTURE_TYPE_MODULE_DESC; moduleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV; moduleDesc.inputSize = il_binary->size(); moduleDesc.pInputModule = @@ -36,7 +37,8 @@ TEST_P(urLevelZeroKernelNativeHandleTest, OwnedHandleRelease) { &module, NULL), ZE_RESULT_SUCCESS); - ze_kernel_desc_t kernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC}; + ze_kernel_desc_t kernelDesc{}; + kernelDesc.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC; kernelDesc.pKernelName = kernel_name.c_str(); ze_kernel_handle_t native_kernel; @@ -75,7 +77,8 @@ TEST_P(urLevelZeroKernelNativeHandleTest, NullProgram) { auto kernel_name = uur::KernelsEnvironment::instance->GetEntryPointNames("foo")[0]; - ze_module_desc_t moduleDesc = {ZE_STRUCTURE_TYPE_MODULE_DESC}; + ze_module_desc_t moduleDesc{}; + moduleDesc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC; moduleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV; moduleDesc.inputSize = il_binary->size(); moduleDesc.pInputModule = @@ -87,7 +90,8 @@ TEST_P(urLevelZeroKernelNativeHandleTest, NullProgram) { &module, NULL), ZE_RESULT_SUCCESS); - ze_kernel_desc_t kernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC}; + ze_kernel_desc_t kernelDesc{}; + kernelDesc.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC; kernelDesc.pKernelName = kernel_name.c_str(); ze_kernel_handle_t native_kernel; diff --git a/test/adapters/level_zero/v2/CMakeLists.txt b/test/adapters/level_zero/v2/CMakeLists.txt index d5ec446323..f6fa03bd6a 100644 --- a/test/adapters/level_zero/v2/CMakeLists.txt +++ b/test/adapters/level_zero/v2/CMakeLists.txt @@ -8,8 +8,7 @@ function(add_unittest name) add_adapter_test(${name} FIXTURE DEVICES ENVIRONMENT - "UR_ADAPTERS_FORCE_LOAD=\"$\"" - "UR_L0_USE_QUEUE_V2=1" + "UR_ADAPTERS_FORCE_LOAD=\"$\"" SOURCES ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/common.cpp ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/ur_level_zero.cpp @@ -17,12 +16,13 @@ function(add_unittest name) target_include_directories(${target} PUBLIC ${PROJECT_SOURCE_DIR}/source - ${PROJECT_SOURCE_DIR}/source/adapters/level_zero + 
${PROJECT_SOURCE_DIR}/source/adapters ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2 LevelZeroLoader-Headers) target_link_libraries(${target} PRIVATE ${PROJECT_NAME}::common + ${PROJECT_NAME}::umf LevelZeroLoader LevelZeroLoader-Headers ) @@ -35,9 +35,22 @@ add_unittest(level_zero_command_list_cache add_unittest(level_zero_event_pool event_pool_test.cpp + ${PROJECT_SOURCE_DIR}/source/ur/ur.cpp + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/adapter.cpp + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/device.cpp + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/platform.cpp ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event_pool.cpp ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event_pool_cache.cpp ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event_provider_normal.cpp ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event_provider_counter.cpp ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event.cpp ) + +add_adapter_test(level_zero_memory_residency + FIXTURE DEVICES + SOURCES + memory_residency.cpp + ENVIRONMENT + "UR_ADAPTERS_FORCE_LOAD=\"$\"" + "ZES_ENABLE_SYSMAN=1" +) diff --git a/test/adapters/level_zero/v2/command_list_cache_test.cpp b/test/adapters/level_zero/v2/command_list_cache_test.cpp index b8c7244352..44755b699e 100644 --- a/test/adapters/level_zero/v2/command_list_cache_test.cpp +++ b/test/adapters/level_zero/v2/command_list_cache_test.cpp @@ -4,11 +4,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "command_list_cache.hpp" -#include "common.hpp" -#include "v2/context.hpp" - #include "context.hpp" -#include "device.hpp" + +#include "level_zero/common.hpp" +#include "level_zero/device.hpp" #include "uur/fixtures.h" #include "uur/raii.h" @@ -24,7 +23,7 @@ struct CommandListCacheTest : public uur::urContextTest {}; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(CommandListCacheTest); TEST_P(CommandListCacheTest, CanStoreAndRetriveImmediateAndRegularCmdLists) { - v2::command_list_cache_t 
cache(context->ZeContext); + v2::command_list_cache_t cache(context->getZeHandle()); bool IsInOrder = false; uint32_t Ordinal = 0; @@ -76,7 +75,7 @@ TEST_P(CommandListCacheTest, CanStoreAndRetriveImmediateAndRegularCmdLists) { } TEST_P(CommandListCacheTest, ImmediateCommandListsHaveProperAttributes) { - v2::command_list_cache_t cache(context->ZeContext); + v2::command_list_cache_t cache(context->getZeHandle()); uint32_t numQueueGroups = 0; ASSERT_EQ(zeDeviceGetCommandQueueGroupProperties(device->ZeDevice, @@ -193,29 +192,23 @@ TEST_P(CommandListCacheTest, CommandListsAreReusedByQueues) { QueueProps.pNext = &IndexProps; } - ur_queue_handle_t Queue; - ASSERT_EQ( - urQueueCreate(context, device, &QueueProps, &Queue), - UR_RESULT_SUCCESS); + uur::raii::Queue Queue; + ASSERT_EQ(urQueueCreate(context, device, &QueueProps, + Queue.ptr()), + UR_RESULT_SUCCESS); Queues.emplace_back(Queue); } } - ASSERT_EQ(static_cast(context) - ->commandListCache.getNumImmediateCommandLists(), - 0); - ASSERT_EQ(static_cast(context) - ->commandListCache.getNumRegularCommandLists(), + ASSERT_EQ(context->commandListCache.getNumImmediateCommandLists(), 0); + ASSERT_EQ(context->commandListCache.getNumRegularCommandLists(), 0); } // Queues scope - ASSERT_EQ(static_cast(context) - ->commandListCache.getNumImmediateCommandLists(), + ASSERT_EQ(context->commandListCache.getNumImmediateCommandLists(), NumUniqueQueueTypes * 2); // * 2 for compute and copy - ASSERT_EQ(static_cast(context) - ->commandListCache.getNumRegularCommandLists(), - 0); + ASSERT_EQ(context->commandListCache.getNumRegularCommandLists(), 0); } } @@ -241,9 +234,9 @@ TEST_P(CommandListCacheTest, CommandListsCacheIsThreadSafe) { urQueueCreate(context, device, &QueueProps, Queue.ptr()), UR_RESULT_SUCCESS); - ASSERT_LE(static_cast(context) - ->commandListCache.getNumImmediateCommandLists(), - NumThreads * 2); // * 2 for compute and copy + ASSERT_LE( + context->commandListCache.getNumImmediateCommandLists(), + NumThreads * 2); // * 2 for 
compute and copy } }); } @@ -252,7 +245,6 @@ TEST_P(CommandListCacheTest, CommandListsCacheIsThreadSafe) { Thread.join(); } - ASSERT_LE(static_cast(context) - ->commandListCache.getNumImmediateCommandLists(), + ASSERT_LE(context->commandListCache.getNumImmediateCommandLists(), NumThreads * 2); } diff --git a/test/adapters/level_zero/v2/event_pool_test.cpp b/test/adapters/level_zero/v2/event_pool_test.cpp index b4f7e46f11..9443e8fa7a 100644 --- a/test/adapters/level_zero/v2/event_pool_test.cpp +++ b/test/adapters/level_zero/v2/event_pool_test.cpp @@ -4,11 +4,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "command_list_cache.hpp" -#include "common.hpp" -#include "context.hpp" -#include "device.hpp" +#include "level_zero/common.hpp" +#include "level_zero/device.hpp" +#include "context.hpp" #include "event_pool.hpp" #include "event_pool_cache.hpp" #include "event_provider.hpp" @@ -42,15 +42,21 @@ static const char *provider_to_str(ProviderType p) { } } -static const char *event_to_str(event_type e) { - switch (e) { - case EVENT_REGULAR: - return "EVENT_REGULAR"; - case EVENT_COUNTER: - return "EVENT_COUNTER"; - default: - return nullptr; +static std::string flags_to_str(event_flags_t flags) { + std::string str; + if (flags & EVENT_FLAGS_COUNTER) { + str += "provider_counter"; + } else { + str += "provider_normal"; } + + if (flags & EVENT_FLAGS_PROFILING_ENABLED) { + str += "_profiling"; + } else { + str += "_no_profiling"; + } + + return str; } static const char *queue_to_str(queue_type e) { @@ -66,7 +72,7 @@ static const char *queue_to_str(queue_type e) { struct ProviderParams { ProviderType provider; - v2::event_type event; + event_flags_t flags; v2::queue_type queue; }; @@ -81,7 +87,7 @@ printParams(const testing::TestParamInfo &info) { std::ostringstream params_stream; params_stream << platform_device_name << "__" << provider_to_str(params.provider) << "_" - << event_to_str(params.event) << "_" + << flags_to_str(params.flags) << "_" << 
queue_to_str(params.queue); return params_stream.str(); } @@ -94,7 +100,8 @@ struct EventPoolTest : public uur::urContextTestWithParam { cache = std::unique_ptr(new event_pool_cache( MAX_DEVICES, - [this, params](DeviceId) -> std::unique_ptr { + [this, params](DeviceId, event_flags_t flags) + -> std::unique_ptr { // normally id would be used to find the appropriate device to create the provider switch (params.provider) { case TEST_PROVIDER_COUNTER: @@ -102,7 +109,7 @@ struct EventPoolTest : public uur::urContextTestWithParam { device); case TEST_PROVIDER_NORMAL: return std::make_unique( - context, device, params.event, params.queue); + context, device, params.queue, flags); } return nullptr; })); @@ -116,9 +123,11 @@ struct EventPoolTest : public uur::urContextTestWithParam { }; static ProviderParams test_cases[] = { - {TEST_PROVIDER_NORMAL, EVENT_REGULAR, QUEUE_REGULAR}, - {TEST_PROVIDER_NORMAL, EVENT_COUNTER, QUEUE_REGULAR}, - {TEST_PROVIDER_NORMAL, EVENT_COUNTER, QUEUE_IMMEDIATE}, + {TEST_PROVIDER_NORMAL, 0, QUEUE_REGULAR}, + {TEST_PROVIDER_NORMAL, EVENT_FLAGS_COUNTER, QUEUE_REGULAR}, + {TEST_PROVIDER_NORMAL, EVENT_FLAGS_COUNTER, QUEUE_IMMEDIATE}, + {TEST_PROVIDER_NORMAL, EVENT_FLAGS_COUNTER | EVENT_FLAGS_PROFILING_ENABLED, + QUEUE_IMMEDIATE}, // TODO: counter provided is not fully unimplemented // counter-based provider ignores event and queue type //{TEST_PROVIDER_COUNTER, EVENT_COUNTER, QUEUE_IMMEDIATE}, @@ -128,31 +137,33 @@ UUR_TEST_SUITE_P(EventPoolTest, testing::ValuesIn(test_cases), printParams); TEST_P(EventPoolTest, InvalidDevice) { - auto pool = cache->borrow(MAX_DEVICES); + auto pool = cache->borrow(MAX_DEVICES, getParam().flags); ASSERT_EQ(pool, nullptr); - pool = cache->borrow(MAX_DEVICES + 10); + pool = cache->borrow(MAX_DEVICES + 10, getParam().flags); ASSERT_EQ(pool, nullptr); } TEST_P(EventPoolTest, Basic) { { - ur_event *first; + ur_event_handle_t first; ze_event_handle_t zeFirst; { - auto pool = cache->borrow(device->Id); + auto pool = 
cache->borrow(device->Id.value(), getParam().flags); first = pool->allocate(); zeFirst = first->getZeEvent(); - pool->free(first); + + urEventRelease(first); } - ur_event *second; + ur_event_handle_t second; ze_event_handle_t zeSecond; { - auto pool = cache->borrow(device->Id); + auto pool = cache->borrow(device->Id.value(), getParam().flags); second = pool->allocate(); zeSecond = second->getZeEvent(); - pool->free(second); + + urEventRelease(second); } ASSERT_EQ(first, second); ASSERT_EQ(zeFirst, zeSecond); @@ -165,13 +176,13 @@ TEST_P(EventPoolTest, Threaded) { for (int iters = 0; iters < 3; ++iters) { for (int th = 0; th < 10; ++th) { threads.emplace_back([&] { - auto pool = cache->borrow(device->Id); - std::vector events; + auto pool = cache->borrow(device->Id.value(), getParam().flags); + std::vector events; for (int i = 0; i < 100; ++i) { events.push_back(pool->allocate()); } for (int i = 0; i < 100; ++i) { - pool->free(events[i]); + urEventRelease(events[i]); } }); } @@ -183,14 +194,14 @@ TEST_P(EventPoolTest, Threaded) { } TEST_P(EventPoolTest, ProviderNormalUseMostFreePool) { - auto pool = cache->borrow(device->Id); - std::list events; + auto pool = cache->borrow(device->Id.value(), getParam().flags); + std::list events; for (int i = 0; i < 128; ++i) { events.push_back(pool->allocate()); } auto frontZeHandle = events.front()->getZeEvent(); for (int i = 0; i < 8; ++i) { - pool->free(events.front()); + urEventRelease(events.front()); events.pop_front(); } for (int i = 0; i < 8; ++i) { @@ -202,6 +213,6 @@ TEST_P(EventPoolTest, ProviderNormalUseMostFreePool) { ASSERT_EQ(frontZeHandle, events.back()->getZeEvent()); for (auto e : events) { - pool->free(e); + urEventRelease(e); } } diff --git a/test/adapters/level_zero/v2/memory_residency.cpp b/test/adapters/level_zero/v2/memory_residency.cpp new file mode 100644 index 0000000000..b1e2b561a5 --- /dev/null +++ b/test/adapters/level_zero/v2/memory_residency.cpp @@ -0,0 +1,48 @@ +// Copyright (C) 2024 Intel 
Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "ur_print.hpp" +#include "uur/fixtures.h" +#include "uur/raii.h" +#include "uur/utils.h" + +#include +#include + +using urMemoryResidencyTest = uur::urMultiDeviceContextTestTemplate<1>; + +TEST_F(urMemoryResidencyTest, allocatingDeviceMemoryWillResultInOOM) { + static constexpr size_t allocSize = 1024 * 1024; + + if (!uur::isPVC(uur::DevicesEnvironment::instance->devices[0])) { + GTEST_SKIP() << "Test requires a PVC device"; + } + + size_t initialMemFree = 0; + ASSERT_SUCCESS( + urDeviceGetInfo(uur::DevicesEnvironment::instance->devices[0], + UR_DEVICE_INFO_GLOBAL_MEM_FREE, sizeof(size_t), + &initialMemFree, nullptr)); + + if (initialMemFree < allocSize) { + GTEST_SKIP() << "Not enough device memory available"; + } + + void *ptr = nullptr; + ASSERT_SUCCESS( + urUSMDeviceAlloc(context, uur::DevicesEnvironment::instance->devices[0], + nullptr, nullptr, allocSize, &ptr)); + + size_t currentMemFree = 0; + ASSERT_SUCCESS( + urDeviceGetInfo(uur::DevicesEnvironment::instance->devices[0], + UR_DEVICE_INFO_GLOBAL_MEM_FREE, sizeof(size_t), + ¤tMemFree, nullptr)); + + // amount of free memory should decrease after making a memory allocation resident + ASSERT_LE(currentMemFree, initialMemFree); + + ASSERT_SUCCESS(urUSMFree(context, ptr)); +} diff --git a/test/adapters/level_zero/zeCallMap.cpp b/test/adapters/level_zero/zeCallMap.cpp index 3c6487f36d..c2e47b856d 100644 --- a/test/adapters/level_zero/zeCallMap.cpp +++ b/test/adapters/level_zero/zeCallMap.cpp @@ -9,4 +9,5 @@ // Map used by L0 adapter to count the number of calls to each L0 function // Lifetime is managed by the adapter, this variable is defined here // only so that we can read it from the tests. 
-std::map *ZeCallCount = nullptr; +__attribute__((visibility("default"))) std::map *ZeCallCount = + nullptr; diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt index 894ff93632..cfc4725837 100644 --- a/test/conformance/CMakeLists.txt +++ b/test/conformance/CMakeLists.txt @@ -9,47 +9,47 @@ set(UR_CONFORMANCE_DEVICE_BINARIES_DIR "${CMAKE_CURRENT_BINARY_DIR}/device_binaries" CACHE INTERNAL "Internal cache variable for device binaries directory") -function(add_test_adapter name adapter) - if(NOT "${ARGN}" STREQUAL "") - set(EXTRA_NAME "-${ARGN}") - endif() +function(add_test_adapter name adapter backend) set(TEST_TARGET_NAME test-${name}) - set(TEST_NAME ${name}-${adapter}${EXTRA_NAME}) + set(TEST_NAME ${name}-${adapter}) set(TEST_COMMAND - "${PROJECT_BINARY_DIR}/bin/${TEST_TARGET_NAME} --test_devices_count=${UR_TEST_DEVICES_COUNT} --test_platforms_count=${UR_TEST_PLATFORMS_COUNT}" + "${PROJECT_BINARY_DIR}/bin/${TEST_TARGET_NAME} --backend=${backend} --devices_count=${UR_TEST_DEVICES_COUNT} --platforms_count=${UR_TEST_PLATFORMS_COUNT}" ) + set(MATCH_FILE "${CMAKE_CURRENT_SOURCE_DIR}/${name}_${adapter}.match") + + function(do_add_test tname env) + if(${UR_CONFORMANCE_ENABLE_MATCH_FILES} AND EXISTS ${MATCH_FILE}) + add_test(NAME ${tname} + COMMAND ${Python3_EXECUTABLE} ${UR_CONFORMANCE_TEST_DIR}/cts_exe.py + --failslist ${MATCH_FILE} + --test_command ${PROJECT_BINARY_DIR}/bin/${TEST_TARGET_NAME} + -- + --backend=${backend} + --devices_count=${UR_TEST_DEVICES_COUNT} + --platforms_count=${UR_TEST_PLATFORMS_COUNT} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + else() + separate_arguments(TEST_COMMAND) + add_test(NAME ${tname} + COMMAND ${TEST_COMMAND} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + endif() - if(UR_CONFORMANCE_ENABLE_MATCH_FILES) - add_test(NAME ${TEST_NAME} - COMMAND ${CMAKE_COMMAND} - -D TEST_FILE=${Python3_EXECUTABLE} - -D TEST_ARGS="${UR_CONFORMANCE_TEST_DIR}/cts_exe.py --test_command ${TEST_COMMAND}" - -D 
MODE=stdout - -D MATCH_FILE=${CMAKE_CURRENT_SOURCE_DIR}/${name}_${adapter}${EXTRA_NAME}.match - -P ${PROJECT_SOURCE_DIR}/cmake/match.cmake - DEPENDS ${TEST_TARGET_NAME} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - ) - else() - separate_arguments(TEST_COMMAND) - add_test(NAME ${TEST_NAME} - COMMAND ${TEST_COMMAND} - DEPENDS ${TEST_TARGET_NAME} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - ) - endif() - - set(TEST_ENV UR_ADAPTERS_FORCE_LOAD="$") - if("${ARGN}" STREQUAL "v2") - list(APPEND TEST_ENV "UR_L0_USE_QUEUE_V2=1") - endif() - if(UR_CONFORMANCE_ENABLE_MATCH_FILES) - list(APPEND TEST_ENV GTEST_COLOR=no) + if(UR_CONFORMANCE_ENABLE_MATCH_FILES) + list(APPEND env GTEST_COLOR=yes) + endif() + set_tests_properties(${tname} PROPERTIES + ENVIRONMENT "${env}" + LABELS "conformance;${adapter}") + endfunction() + + do_add_test(${TEST_NAME} UR_ADAPTERS_FORCE_LOAD="$") + if(UR_CONFORMANCE_TEST_LOADER) + do_add_test(${TEST_NAME}-loader "") endif() - set_tests_properties(${TEST_NAME} PROPERTIES - ENVIRONMENT "${TEST_ENV}" - LABELS "conformance;${adapter}") endfunction() function(add_conformance_test name) @@ -68,26 +68,22 @@ function(add_conformance_test name) unit_tests_helpers) if(UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_ALL) - add_test_adapter(${name} adapter_cuda) + add_test_adapter(${name} adapter_cuda CUDA) endif() if(UR_BUILD_ADAPTER_HIP OR UR_BUILD_ADAPTER_ALL) - add_test_adapter(${name} adapter_hip) + add_test_adapter(${name} adapter_hip HIP) endif() if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_ALL) - add_test_adapter(${name} adapter_level_zero) - add_test_adapter(${name} adapter_level_zero v2) + add_test_adapter(${name} adapter_level_zero LEVEL_ZERO) + endif() + if(UR_BUILD_ADAPTER_L0_V2) + add_test_adapter(${name} adapter_level_zero_v2 LEVEL_ZERO) endif() if(UR_BUILD_ADAPTER_OPENCL OR UR_BUILD_ADAPTER_ALL) - add_test_adapter(${name} adapter_opencl) + add_test_adapter(${name} adapter_opencl OPENCL) endif() if(UR_BUILD_ADAPTER_NATIVE_CPU OR 
UR_BUILD_ADAPTER_ALL) - add_test_adapter(${name} adapter_native_cpu) - endif() - - if(NOT (UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_HIP - OR UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_OPENCL - OR UR_BUILD_ADAPTER_NATIVE_CPU OR UR_BUILD_ADAPTER_ALL)) - add_test_adapter(${name} adapter_mock) + add_test_adapter(${name} adapter_native_cpu NATIVE_CPU) endif() endfunction() @@ -122,6 +118,19 @@ add_subdirectory(queue) add_subdirectory(sampler) add_subdirectory(virtual_memory) +set(TEST_SUBDIRECTORIES_DPCXX + "device_code" + "kernel" + "program" + "enqueue" + "integration" + "exp_command_buffer" + "exp_enqueue_native" + "exp_usm_p2p" + "exp_launch_properties" + "memory-migrate" +) + if(UR_DPCXX) add_custom_target(generate_device_binaries) @@ -130,7 +139,7 @@ if(UR_DPCXX) file(MAKE_DIRECTORY ${UR_CONFORMANCE_DEVICE_BINARIES_DIR}) if("${UR_CONFORMANCE_TARGET_TRIPLES}" STREQUAL "") - if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_OPENCL OR UR_BUILD_ADAPTER_ALL) + if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_L0_V2 OR UR_BUILD_ADAPTER_OPENCL OR UR_BUILD_ADAPTER_ALL) list(APPEND TARGET_TRIPLES "spir64") endif() if(UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_ALL) @@ -143,18 +152,20 @@ if(UR_DPCXX) string(REPLACE "," ";" TARGET_TRIPLES ${UR_CONFORMANCE_TARGET_TRIPLES}) endif() - add_subdirectory(device_code) - add_subdirectory(kernel) - add_subdirectory(program) - add_subdirectory(enqueue) - add_subdirectory(integration) - add_subdirectory(exp_command_buffer) - add_subdirectory(exp_enqueue_native) - add_subdirectory(exp_usm_p2p) - add_subdirectory(exp_launch_properties) - add_subdirectory(memory-migrate) + foreach(dir ${TEST_SUBDIRECTORIES_DPCXX}) + add_subdirectory(${dir}) + endforeach() else() + set(DISABLED_TESTS "") + foreach(dir ${TEST_SUBDIRECTORIES_DPCXX}) + if(NOT dir STREQUAL "device_code") + list(APPEND DISABLED_TESTS "test-${dir}") + endif() + endforeach() + + string(REPLACE ";" ", " DISABLED_TESTS_STR "${DISABLED_TESTS}") + message(WARNING "UR_DPCXX is not defined, the 
following conformance test executables \ - are disabled: test-program, test-kernel, test-enqueue") + are disabled: ${DISABLED_TESTS_STR}") endif() diff --git a/test/conformance/adapter/adapter_adapter_cuda.match b/test/conformance/adapter/adapter_adapter_cuda.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/adapter/adapter_adapter_hip.match b/test/conformance/adapter/adapter_adapter_hip.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/adapter/adapter_adapter_level_zero-v2.match b/test/conformance/adapter/adapter_adapter_level_zero-v2.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/adapter/adapter_adapter_level_zero.match b/test/conformance/adapter/adapter_adapter_level_zero.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/adapter/adapter_adapter_native_cpu.match b/test/conformance/adapter/adapter_adapter_native_cpu.match index 5bc60575e2..1335caf904 100644 --- a/test/conformance/adapter/adapter_adapter_native_cpu.match +++ b/test/conformance/adapter/adapter_adapter_native_cpu.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urAdapterGetLastErrorTest.Success urAdapterGetLastErrorTest.InvalidHandle urAdapterGetLastErrorTest.InvalidMessagePtr diff --git a/test/conformance/adapter/adapter_adapter_opencl.match b/test/conformance/adapter/adapter_adapter_opencl.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/adapter/urAdapterGet.cpp b/test/conformance/adapter/urAdapterGet.cpp index 2c3b62a620..24d1a33057 100644 --- a/test/conformance/adapter/urAdapterGet.cpp +++ b/test/conformance/adapter/urAdapterGet.cpp @@ -18,5 +18,6 @@ TEST_F(urAdapterGetTest, InvalidNumEntries) { uint32_t adapter_count; ASSERT_SUCCESS(urAdapterGet(0, nullptr, &adapter_count)); std::vector adapters(adapter_count); - ASSERT_SUCCESS(urAdapterGet(0, adapters.data(), nullptr)); + ASSERT_EQ(urAdapterGet(0, 
adapters.data(), nullptr), + UR_RESULT_ERROR_INVALID_SIZE); } diff --git a/test/conformance/adapter/urAdapterGetInfo.cpp b/test/conformance/adapter/urAdapterGetInfo.cpp index 6eea5182d8..4dff3ce4dc 100644 --- a/test/conformance/adapter/urAdapterGetInfo.cpp +++ b/test/conformance/adapter/urAdapterGetInfo.cpp @@ -87,3 +87,21 @@ TEST_F(urAdapterGetInfoTest, InvalidNullPointerPropSizeRet) { urAdapterGetInfo(adapter, UR_ADAPTER_INFO_BACKEND, 0, nullptr, nullptr), UR_RESULT_ERROR_INVALID_NULL_POINTER); } + +TEST_F(urAdapterGetInfoTest, ReferenceCountNotZero) { + uint32_t referenceCount = 0; + + ASSERT_SUCCESS(urAdapterGetInfo(adapter, UR_ADAPTER_INFO_REFERENCE_COUNT, + sizeof(referenceCount), &referenceCount, + nullptr)); + ASSERT_GT(referenceCount, 0); +} + +TEST_F(urAdapterGetInfoTest, ValidAdapterBackend) { + ur_adapter_backend_t backend; + ASSERT_SUCCESS(urAdapterGetInfo(adapter, UR_ADAPTER_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + + ASSERT_TRUE(backend >= UR_ADAPTER_BACKEND_LEVEL_ZERO && + backend <= UR_ADAPTER_BACKEND_NATIVE_CPU); +} diff --git a/test/conformance/adapter/urAdapterRelease.cpp b/test/conformance/adapter/urAdapterRelease.cpp index e7c5bd11ce..8b29fa0f2c 100644 --- a/test/conformance/adapter/urAdapterRelease.cpp +++ b/test/conformance/adapter/urAdapterRelease.cpp @@ -15,8 +15,19 @@ struct urAdapterReleaseTest : uur::runtime::urAdapterTest { }; TEST_F(urAdapterReleaseTest, Success) { - ASSERT_SUCCESS(urAdapterRetain(adapter)); + uint32_t referenceCountBefore = 0; + + ASSERT_SUCCESS(urAdapterGetInfo(adapter, UR_ADAPTER_INFO_REFERENCE_COUNT, + sizeof(referenceCountBefore), + &referenceCountBefore, nullptr)); + + uint32_t referenceCountAfter = 0; EXPECT_SUCCESS(urAdapterRelease(adapter)); + ASSERT_SUCCESS(urAdapterGetInfo(adapter, UR_ADAPTER_INFO_REFERENCE_COUNT, + sizeof(referenceCountAfter), + &referenceCountAfter, nullptr)); + + ASSERT_LE(referenceCountAfter, referenceCountBefore); } TEST_F(urAdapterReleaseTest, InvalidNullHandleAdapter) { 
diff --git a/test/conformance/adapter/urAdapterRetain.cpp b/test/conformance/adapter/urAdapterRetain.cpp index 2a5efd0344..86967b983b 100644 --- a/test/conformance/adapter/urAdapterRetain.cpp +++ b/test/conformance/adapter/urAdapterRetain.cpp @@ -15,8 +15,19 @@ struct urAdapterRetainTest : uur::runtime::urAdapterTest { }; TEST_F(urAdapterRetainTest, Success) { - ASSERT_SUCCESS(urAdapterRetain(adapter)); - EXPECT_SUCCESS(urAdapterRelease(adapter)); + uint32_t referenceCountBefore = 0; + + ASSERT_SUCCESS(urAdapterGetInfo(adapter, UR_ADAPTER_INFO_REFERENCE_COUNT, + sizeof(referenceCountBefore), + &referenceCountBefore, nullptr)); + + uint32_t referenceCountAfter = 0; + EXPECT_SUCCESS(urAdapterRetain(adapter)); + ASSERT_SUCCESS(urAdapterGetInfo(adapter, UR_ADAPTER_INFO_REFERENCE_COUNT, + sizeof(referenceCountAfter), + &referenceCountAfter, nullptr)); + + ASSERT_GT(referenceCountAfter, referenceCountBefore); } TEST_F(urAdapterRetainTest, InvalidNullHandleAdapter) { diff --git a/test/conformance/context/context_adapter_cuda.match b/test/conformance/context/context_adapter_cuda.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/context/context_adapter_hip.match b/test/conformance/context/context_adapter_hip.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/context/context_adapter_level_zero-v2.match b/test/conformance/context/context_adapter_level_zero-v2.match deleted file mode 100644 index f25df872a3..0000000000 --- a/test/conformance/context/context_adapter_level_zero-v2.match +++ /dev/null @@ -1,2 +0,0 @@ -urContextCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urContextSetExtendedDeleterTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/context/context_adapter_level_zero.match b/test/conformance/context/context_adapter_level_zero.match index f25df872a3..c36611b9a5 100644 
--- a/test/conformance/context/context_adapter_level_zero.match +++ b/test/conformance/context/context_adapter_level_zero.match @@ -1,2 +1,2 @@ -urContextCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{NONDETERMINISTIC}} urContextSetExtendedDeleterTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/context/context_adapter_level_zero_v2.match b/test/conformance/context/context_adapter_level_zero_v2.match new file mode 100644 index 0000000000..2e6ea80468 --- /dev/null +++ b/test/conformance/context/context_adapter_level_zero_v2.match @@ -0,0 +1 @@ +urContextSetExtendedDeleterTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}__ diff --git a/test/conformance/context/context_adapter_native_cpu.match b/test/conformance/context/context_adapter_native_cpu.match index 2ad35ad411..3f80da7c36 100644 --- a/test/conformance/context/context_adapter_native_cpu.match +++ b/test/conformance/context/context_adapter_native_cpu.match @@ -1,3 +1 @@ -urContextCreateWithNativeHandleTest.InvalidNullHandleAdapter/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -urContextCreateWithNativeHandleTest.InvalidNullPointerContext/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urContextSetExtendedDeleterTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} diff --git a/test/conformance/context/context_adapter_opencl.match b/test/conformance/context/context_adapter_opencl.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/context/urContextCreate.cpp b/test/conformance/context/urContextCreate.cpp index d5fb59389f..0f268a3992 100644 --- a/test/conformance/context/urContextCreate.cpp +++ b/test/conformance/context/urContextCreate.cpp @@ -17,7 +17,8 @@ TEST_P(urContextCreateTest, Success) { } TEST_P(urContextCreateTest, SuccessWithProperties) { - ur_context_properties_t properties{UR_STRUCTURE_TYPE_CONTEXT_PROPERTIES}; + 
ur_context_properties_t properties{UR_STRUCTURE_TYPE_CONTEXT_PROPERTIES, + nullptr, 0}; uur::raii::Context context = nullptr; ASSERT_SUCCESS(urContextCreate(1, &device, &properties, context.ptr())); ASSERT_NE(nullptr, context); diff --git a/test/conformance/context/urContextCreateWithNativeHandle.cpp b/test/conformance/context/urContextCreateWithNativeHandle.cpp index d33c9e69a0..9b1c61f14a 100644 --- a/test/conformance/context/urContextCreateWithNativeHandle.cpp +++ b/test/conformance/context/urContextCreateWithNativeHandle.cpp @@ -46,11 +46,6 @@ TEST_P(urContextCreateWithNativeHandleTest, SuccessWithOwnedNativeHandle) { UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urContextCreateWithNativeHandle( native_context, adapter, 1, &device, &props, &ctx)); ASSERT_NE(ctx, nullptr); - - uint32_t ref_count = 0; - ASSERT_SUCCESS(urContextGetInfo(ctx, UR_CONTEXT_INFO_REFERENCE_COUNT, - sizeof(uint32_t), &ref_count, nullptr)); - ASSERT_EQ(ref_count, 1); } TEST_P(urContextCreateWithNativeHandleTest, SuccessWithUnOwnedNativeHandle) { @@ -66,18 +61,12 @@ TEST_P(urContextCreateWithNativeHandleTest, SuccessWithUnOwnedNativeHandle) { UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urContextCreateWithNativeHandle( native_context, adapter, 1, &device, &props, &ctx)); ASSERT_NE(ctx, nullptr); - - uint32_t ref_count = 0; - ASSERT_SUCCESS(urContextGetInfo(ctx, UR_CONTEXT_INFO_REFERENCE_COUNT, - sizeof(uint32_t), &ref_count, nullptr)); - ASSERT_EQ(ref_count, 2); - - ASSERT_SUCCESS(urContextRelease(ctx)); } TEST_P(urContextCreateWithNativeHandleTest, InvalidNullHandleAdapter) { ur_native_handle_t native_context = 0; - ASSERT_SUCCESS(urContextGetNativeHandle(context, &native_context)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urContextGetNativeHandle(context, &native_context)); ur_context_handle_t ctx = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, @@ -87,7 +76,8 @@ TEST_P(urContextCreateWithNativeHandleTest, InvalidNullHandleAdapter) { TEST_P(urContextCreateWithNativeHandleTest, 
InvalidNullPointerContext) { ur_native_handle_t native_context = 0; - ASSERT_SUCCESS(urContextGetNativeHandle(context, &native_context)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urContextGetNativeHandle(context, &native_context)); ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, urContextCreateWithNativeHandle(native_context, adapter, 1, diff --git a/test/conformance/cts_exe.py b/test/conformance/cts_exe.py old mode 100644 new mode 100755 index 97ada1ba4b..b183b55d6e --- a/test/conformance/cts_exe.py +++ b/test/conformance/cts_exe.py @@ -1,6 +1,6 @@ -#! /usr/bin/env python3 +#!/usr/bin/env python3 """ - Copyright (C) 2023 Intel Corporation + Copyright (C) 2024 Intel Corporation Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT @@ -10,50 +10,172 @@ # Printing conformance test output from gtest and checking failed tests with match files. # The match files contain tests that are expected to fail. +import os import sys -from argparse import ArgumentParser +import argparse import subprocess # nosec B404 -import signal -import re -from collections import OrderedDict -if __name__ == '__main__': - parser = ArgumentParser() +def _ci(): + return os.environ.get("CI") is not None + + +def _color(): + return sys.stdout.isatty() or os.environ.get("GTEST_COLOR").lower() == "yes" + + +def _print_header(header, *args): + if _ci(): + # GitHub CI interprets this as a "group header" and will provide buttons to fold/unfold it + print("##[group]{}".format(header.format(*args))) + elif _color(): + # Inverse color + print("\033[7m{}\033[27m".format(header.format(*args))) + else: + print("### {}".format(header.format(*args))) + + +def _print_end_header(): + if _ci(): + print("##[endgroup]") + + +def _print_error(header, *args): + if _color(): + # "!!!" on a red background + print("\033[41m!!!\033[0m {}".format(header.format(*args))) + else: + print("!!! 
{}".format(header.format(*args))) + + +def _print_format(msg, *args): + print(msg.format(*args)) + + +def _print_environ(env): + _print_header("Environment") + for k, v in env.items(): + _print_format("> {} = {}", k, v) + _print_end_header() + + +def _check_filter(cmd, filter): + """ + Checks that the filter matches at least one test for the given cmd + """ + sys.stdout.flush() + check = subprocess.Popen( # nosec B603 + cmd + ["--gtest_list_tests"], + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + env=(os.environ | {"GTEST_FILTER": filter}), + ) + if not check.stdout.read(1): + return False + return True + + +def _run_cmd(cmd, comment, filter): + _print_header("Running suite for: {}", comment) + _print_format("### {}", " ".join(cmd)) + + # Check tests are found + if not _check_filter(cmd, filter): + _print_end_header() + _print_error("Could not find any tests with this filter") + return 2 + + sys.stdout.flush() + result = subprocess.call( # nosec B603 + cmd, + stdout=sys.stdout, + stderr=sys.stdout, + env=(os.environ | {"GTEST_FILTER": filter}), + ) + _print_end_header() + return result + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() parser.add_argument("--test_command", help="Ctest test case") - parser.add_argument("--test_devices_count", type=str, help="Number of devices on which tests will be run") - parser.add_argument("--test_platforms_count", type=str, help="Number of platforms on which tests will be run") + parser.add_argument("--failslist", type=str, help="Failure list") + parser.add_argument("--", dest="ignored", action="store_true") + parser.add_argument("rest", nargs=argparse.REMAINDER) args = parser.parse_args() - result = subprocess.Popen([args.test_command, '--gtest_brief=1', # nosec B603 - f'--devices_count={args.test_devices_count}', - f'--platforms_count={args.test_platforms_count}'], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) - - pat = re.compile(r'\[( )*FAILED( )*\]') - output_list = [] - 
test_cases = [] - for line in result.stdout: - output_list.append(line) - if pat.search(line): - test_case = line.split(" ")[5] - test_case = test_case.rstrip(',') - test_cases.append(test_case) - - # Every fail has a single corresponding match line but if there are multiple - # devices being tested there will be multiple lines with the same failure - # message. To avoid matching mismatch, remove lines that differ only by device ID. - test_cases = [re.sub(r'ID[0-9]ID', 'X', tc) for tc in test_cases] - test_cases = list(OrderedDict.fromkeys(test_cases)) - - for tc in test_cases: - print(tc) - - rc = result.wait() - if rc < 0: - print(signal.strsignal(abs(rc))) - - print("#### GTEST_OUTPUT ####", file=sys.stderr) - for output in output_list: - print(output, file=sys.stderr) - print("#### GTEST_OUTPUT_END ####", file=sys.stderr) + base_invocation = [args.test_command] + args.rest + + if os.environ.get("GTEST_OUTPUT") is not None: + # We are being ran purely to generate an output file (likely for ctest_parser.py); falling back to just using + # one test execution + sys.exit( + subprocess.call( # nosec B603 + base_invocation, stdout=sys.stdout, stderr=sys.stderr + ) + ) + + _print_environ(os.environ) + + # Parse fails list + _print_format("Loading fails from {}", args.failslist) + fail_patterns = [] + expected_fail = False + with open(args.failslist) as f: + for l in f: + optional = "{{OPT}}" in l + l = l.replace("{{OPT}}", "") + l = l.replace("{{.*}}", "*") + + if l.startswith("{{Segmentation fault"): + expected_fail = True + continue + if l.startswith("#"): + continue + if l.startswith("{{NONDETERMINISTIC}}"): + continue + if l.strip() == "": + continue + + fail_patterns.append( + { + "pattern": l.strip(), + "optional": optional, + } + ) + + _print_header("Known failing tests") + for fail in fail_patterns: + _print_format("> {}", fail) + _print_end_header() + if len(fail_patterns) == 0: + _print_error( + "Fail list is empty, if there are no more failures, please 
remove the file" + ) + sys.exit(2) + + final_result = 0 + + # First, run all the known good tests + gtest_filter = "-" + (":".join(map(lambda x: x["pattern"], fail_patterns))) + if _check_filter(base_invocation, gtest_filter): + result = _run_cmd(base_invocation, "known good tests", gtest_filter) + if result != 0 and not expected_fail: + _print_error("Tests we expected to pass have failed") + final_result = result + else: + _print_format("Note: No tests in this suite are expected to pass") + + # Then run each known failing tests + for fail in fail_patterns: + result = _run_cmd( + base_invocation, "failing test {}".format(fail["pattern"]), fail["pattern"] + ) + + if result == 0 and not fail["optional"]: + _print_error( + "Test {} is passing when we expect it to fail!", fail["pattern"] + ) + final_result = 1 + + sys.exit(final_result) diff --git a/test/conformance/device/device_adapter_cuda.match b/test/conformance/device/device_adapter_cuda.match index 9989fbd774..ff961cc6f5 100644 --- a/test/conformance/device/device_adapter_cuda.match +++ b/test/conformance/device/device_adapter_cuda.match @@ -1,2 +1,2 @@ -urDeviceCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle +{{NONDETERMINISTIC}} {{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime diff --git a/test/conformance/device/device_adapter_hip.match b/test/conformance/device/device_adapter_hip.match index 9989fbd774..ff961cc6f5 100644 --- a/test/conformance/device/device_adapter_hip.match +++ b/test/conformance/device/device_adapter_hip.match @@ -1,2 +1,2 @@ -urDeviceCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle +{{NONDETERMINISTIC}} {{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime diff --git a/test/conformance/device/device_adapter_level_zero-v2.match b/test/conformance/device/device_adapter_level_zero-v2.match deleted file mode 100644 index 02eb10fb04..0000000000 --- a/test/conformance/device/device_adapter_level_zero-v2.match +++ /dev/null @@ -1,3 +0,0 @@ 
-urDeviceCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle -{{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GLOBAL_MEM_FREE diff --git a/test/conformance/device/device_adapter_level_zero.match b/test/conformance/device/device_adapter_level_zero.match index 02eb10fb04..ff961cc6f5 100644 --- a/test/conformance/device/device_adapter_level_zero.match +++ b/test/conformance/device/device_adapter_level_zero.match @@ -1,3 +1,2 @@ -urDeviceCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle +{{NONDETERMINISTIC}} {{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GLOBAL_MEM_FREE diff --git a/test/conformance/device/device_adapter_level_zero_v2.match b/test/conformance/device/device_adapter_level_zero_v2.match new file mode 100644 index 0000000000..2b9ecbef70 --- /dev/null +++ b/test/conformance/device/device_adapter_level_zero_v2.match @@ -0,0 +1,3 @@ +{{NONDETERMINISTIC}} +{{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime +{{OPT}}urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GLOBAL_MEM_FREE diff --git a/test/conformance/device/device_adapter_native_cpu.match b/test/conformance/device/device_adapter_native_cpu.match index a590340f58..2129478fb8 100644 --- a/test/conformance/device/device_adapter_native_cpu.match +++ b/test/conformance/device/device_adapter_native_cpu.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urDeviceCreateWithNativeHandleTest.InvalidNullHandlePlatform urDeviceCreateWithNativeHandleTest.InvalidNullPointerDevice {{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime @@ -10,7 +11,6 @@ urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GLOBAL_MEM_FREE urDeviceGetInfoTest.Success/UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES urDeviceGetInfoTest.Success/UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES urDeviceGetInfoTest.Success/UR_DEVICE_INFO_IL_VERSION -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS 
urDeviceGetInfoTest.Success/UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS urDeviceGetInfoTest.Success/UR_DEVICE_INFO_UUID urDeviceGetInfoTest.Success/UR_DEVICE_INFO_PCI_ADDRESS diff --git a/test/conformance/device/device_adapter_opencl.match b/test/conformance/device/device_adapter_opencl.match deleted file mode 100644 index 39854cbcd3..0000000000 --- a/test/conformance/device/device_adapter_opencl.match +++ /dev/null @@ -1 +0,0 @@ -urDeviceCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle diff --git a/test/conformance/device/urDeviceCreateWithNativeHandle.cpp b/test/conformance/device/urDeviceCreateWithNativeHandle.cpp index 071183aa9b..8cffc72cf1 100644 --- a/test/conformance/device/urDeviceCreateWithNativeHandle.cpp +++ b/test/conformance/device/urDeviceCreateWithNativeHandle.cpp @@ -20,7 +20,7 @@ TEST_F(urDeviceCreateWithNativeHandleTest, Success) { // and perform some query on it to verify that it works. ur_device_handle_t dev = nullptr; UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urDeviceCreateWithNativeHandle( - native_handle, platform, nullptr, &dev)); + native_handle, adapter, nullptr, &dev)); ASSERT_NE(dev, nullptr); uint32_t dev_id = 0; @@ -41,14 +41,8 @@ TEST_F(urDeviceCreateWithNativeHandleTest, SuccessWithOwnedNativeHandle) { ur_device_native_properties_t props{ UR_STRUCTURE_TYPE_DEVICE_NATIVE_PROPERTIES, nullptr, true}; UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urDeviceCreateWithNativeHandle( - native_handle, platform, &props, &dev)); + native_handle, adapter, &props, &dev)); ASSERT_NE(dev, nullptr); - - uint32_t ref_count = 0; - ASSERT_SUCCESS(urDeviceGetInfo(dev, UR_DEVICE_INFO_REFERENCE_COUNT, - sizeof(uint32_t), &ref_count, nullptr)); - - ASSERT_EQ(ref_count, 1); } } @@ -64,14 +58,8 @@ TEST_F(urDeviceCreateWithNativeHandleTest, SuccessWithUnOwnedNativeHandle) { ur_device_native_properties_t props{ UR_STRUCTURE_TYPE_DEVICE_NATIVE_PROPERTIES, nullptr, false}; UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urDeviceCreateWithNativeHandle( - native_handle, platform, 
&props, &dev)); + native_handle, adapter, &props, &dev)); ASSERT_NE(dev, nullptr); - - uint32_t ref_count = 0; - ASSERT_SUCCESS(urDeviceGetInfo(dev, UR_DEVICE_INFO_REFERENCE_COUNT, - sizeof(uint32_t), &ref_count, nullptr)); - - ASSERT_EQ(ref_count, 2); } } @@ -93,7 +81,7 @@ TEST_F(urDeviceCreateWithNativeHandleTest, InvalidNullPointerDevice) { ASSERT_SUCCESS(urDeviceGetNativeHandle(device, &native_handle)); ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, - urDeviceCreateWithNativeHandle(native_handle, platform, + urDeviceCreateWithNativeHandle(native_handle, adapter, nullptr, nullptr)); } } diff --git a/test/conformance/device_code/CMakeLists.txt b/test/conformance/device_code/CMakeLists.txt index af0bc83d8a..2120d26bf3 100644 --- a/test/conformance/device_code/CMakeLists.txt +++ b/test/conformance/device_code/CMakeLists.txt @@ -141,9 +141,11 @@ add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_2d.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_3d.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_usm.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_usm_2d.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/foo.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/image_copy.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/inc.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/increment.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/mean.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult_usm.cpp) @@ -156,6 +158,7 @@ add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/saxpy_usm.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/indexers_usm.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/build_failure.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fixed_wg_size.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/sequence.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/standard_types.cpp) 
add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/subgroup.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/linker_error.cpp) diff --git a/test/conformance/device_code/fill_usm_2d.cpp b/test/conformance/device_code/fill_usm_2d.cpp new file mode 100644 index 0000000000..2cfba67884 --- /dev/null +++ b/test/conformance/device_code/fill_usm_2d.cpp @@ -0,0 +1,31 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +int main() { + + size_t nd_range_x = 8; + size_t nd_range_y = 8; + + auto nd_range = sycl::range<2>(nd_range_x, nd_range_y); + + std::vector A(nd_range_x * nd_range_y, 1); + uint32_t val = 42; + sycl::queue sycl_queue; + + auto work_range = sycl::nd_range<2>(nd_range, sycl::range<2>(1, 1)); + + uint32_t *data = + sycl::malloc_shared(nd_range_x * nd_range_y, sycl_queue); + sycl_queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + work_range, [data, val](sycl::nd_item<2> item_id) { + auto id = item_id.get_global_linear_id(); + data[id] = val; + }); + }); + return 0; +} diff --git a/test/conformance/device_code/increment.cpp b/test/conformance/device_code/increment.cpp new file mode 100644 index 0000000000..14094c4963 --- /dev/null +++ b/test/conformance/device_code/increment.cpp @@ -0,0 +1,20 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +int main() { + + const size_t inputSize = 1; + sycl::queue sycl_queue; + uint32_t *inputArray = sycl::malloc_shared(inputSize, sycl_queue); + + sycl_queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<1>(inputSize), + [=](sycl::id<1> itemID) { inputArray[itemID] += 1; }); + }); + return 0; +} diff --git a/test/conformance/device_code/indexers_usm.cpp b/test/conformance/device_code/indexers_usm.cpp index e055fa47cc..cd3b56bf0c 100644 --- a/test/conformance/device_code/indexers_usm.cpp +++ b/test/conformance/device_code/indexers_usm.cpp @@ -3,6 +3,9 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Offsets are deprecated, but we should still test that they work +#pragma clang diagnostic ignored "-Wdeprecated-declarations" + #include int main() { diff --git a/test/conformance/device_code/linker_error.cpp b/test/conformance/device_code/linker_error.cpp index 5fc7eebf6f..8afa369bb2 100644 --- a/test/conformance/device_code/linker_error.cpp +++ b/test/conformance/device_code/linker_error.cpp @@ -3,17 +3,17 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include SYCL_EXTERNAL void this_function_does_not_exist(); int main() { - cl::sycl::queue deviceQueue; - cl::sycl::range<1> numOfItems{1}; + sycl::queue deviceQueue; + sycl::range<1> numOfItems{1}; try { - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto kern = [=](cl::sycl::id<1>) { + deviceQueue.submit([&](sycl::handler &cgh) { + auto kern = [=](sycl::id<1>) { #ifdef __SYCL_DEVICE_ONLY__ this_function_does_not_exist(); #endif diff --git a/test/conformance/device_code/sequence.cpp b/test/conformance/device_code/sequence.cpp new file mode 100644 index 0000000000..da176d315c --- /dev/null +++ b/test/conformance/device_code/sequence.cpp @@ -0,0 +1,39 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime 
Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include + +class Add; +class Mul; + +int main() { + sycl::queue deviceQueue; + uint32_t val = 0; + + auto buff = sycl::buffer(&val, 1); + + deviceQueue.submit([&](sycl::handler &cgh) { + auto acc = buff.get_access(cgh); + cgh.single_task([=]() { + for (uint32_t i = 0; i < 1000; i++) { + volatile uint32_t tmp = acc[0]; + acc[0] = tmp + 1; + } + }); + }); + + deviceQueue.submit([&](sycl::handler &cgh) { + auto acc = buff.get_access(cgh); + cgh.single_task([=]() { + for (uint32_t i = 0; i < 2; i++) { + volatile uint32_t tmp = acc[0]; + acc[0] = tmp * 2; + } + }); + }); + + return 0; +} diff --git a/test/conformance/device_code/subgroup.cpp b/test/conformance/device_code/subgroup.cpp index fa4228f846..3243a0e737 100644 --- a/test/conformance/device_code/subgroup.cpp +++ b/test/conformance/device_code/subgroup.cpp @@ -11,6 +11,11 @@ struct KernelFunctor { KernelFunctor(sycl::accessor Acc) : Acc(Acc) {} + auto get(sycl::ext::oneapi::experimental::properties_tag) { + return sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::sub_group_size<8>}; + } + void operator()(sycl::nd_item<1> NdItem) const { auto SG = NdItem.get_sub_group(); if (NdItem.get_global_linear_id() == 0) { diff --git a/test/conformance/enqueue/CMakeLists.txt b/test/conformance/enqueue/CMakeLists.txt index 7cc68203a0..1e19658dac 100644 --- a/test/conformance/enqueue/CMakeLists.txt +++ b/test/conformance/enqueue/CMakeLists.txt @@ -9,6 +9,7 @@ add_conformance_test_with_kernels_environment(enqueue urEnqueueEventsWait.cpp urEnqueueEventsWaitWithBarrier.cpp urEnqueueKernelLaunch.cpp + urEnqueueKernelLaunchAndMemcpyInOrder.cpp urEnqueueMemBufferCopyRect.cpp urEnqueueMemBufferCopy.cpp urEnqueueMemBufferFill.cpp diff --git a/test/conformance/enqueue/enqueue_adapter_cuda.match b/test/conformance/enqueue/enqueue_adapter_cuda.match index 
381612066d..40de7158d0 100644 --- a/test/conformance/enqueue/enqueue_adapter_cuda.match +++ b/test/conformance/enqueue/enqueue_adapter_cuda.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urEnqueueKernelLaunchTest.InvalidKernelArgs/NVIDIA_CUDA_BACKEND___{{.*}}_ urEnqueueKernelLaunchKernelWgSizeTest.NonMatchingLocalSize/NVIDIA_CUDA_BACKEND___{{.*}}_ urEnqueueKernelLaunchKernelSubGroupTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ diff --git a/test/conformance/enqueue/enqueue_adapter_hip.match b/test/conformance/enqueue/enqueue_adapter_hip.match index f602837b14..b841a25cf4 100644 --- a/test/conformance/enqueue/enqueue_adapter_hip.match +++ b/test/conformance/enqueue/enqueue_adapter_hip.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} # HIP can't check kernel arguments urEnqueueKernelLaunchTest.InvalidKernelArgs/AMD_HIP_BACKEND___{{.*}}_ urEnqueueKernelLaunchKernelWgSizeTest.NonMatchingLocalSize/AMD_HIP_BACKEND___{{.*}}_ diff --git a/test/conformance/enqueue/enqueue_adapter_level_zero-v2.match b/test/conformance/enqueue/enqueue_adapter_level_zero-v2.match deleted file mode 100644 index 75f65a68fd..0000000000 --- a/test/conformance/enqueue/enqueue_adapter_level_zero-v2.match +++ /dev/null @@ -1,515 +0,0 @@ - -urEnqueueDeviceGetGlobalVariableReadTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueEventsWaitTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueEventsWaitTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueEventsWaitTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueEventsWaitWithBarrierTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueEventsWaitWithBarrierTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueEventsWaitWithBarrierTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
-urEnqueueKernelLaunchTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueKernelLaunchTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueKernelLaunchTest.InvalidWorkDimension/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueKernelLaunchTest.InvalidWorkGroupSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueKernelLaunchTest.InvalidKernelArgs/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueKernelLaunchKernelWgSizeTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueKernelLaunchKernelWgSizeTest.SuccessWithExplicitLocalSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueKernelLaunchKernelWgSizeTest.NonMatchingLocalSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueKernelLaunchKernelSubGroupTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueKernelLaunchKernelStandardTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueKernelLaunchTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_{{.*}} -urEnqueueKernelLaunchTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D_31 -urEnqueueKernelLaunchTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D_1027 -urEnqueueKernelLaunchTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D_32 -urEnqueueKernelLaunchTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D_256 -urEnqueueKernelLaunchTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D_1_1 -urEnqueueKernelLaunchTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D_31_7 -urEnqueueKernelLaunchTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D_1027_1 
-urEnqueueKernelLaunchTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D_1_32 -urEnqueueKernelLaunchTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D_256_79 -urEnqueueKernelLaunchTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D_1_1_1 -urEnqueueKernelLaunchTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D_31_7_1 -urEnqueueKernelLaunchTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D_1027_1_19 -urEnqueueKernelLaunchTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D_1_53_19 -urEnqueueKernelLaunchTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D_256_79_8 -urEnqueueKernelLaunchWithVirtualMemory.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueKernelLaunchMultiDeviceTest.KernelLaunchReadDifferentQueues/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueKernelLaunchUSMLinkedList.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled -urEnqueueKernelLaunchUSMLinkedList.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled -urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_whole_buffer_2D -urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_non_zero_offsets_2D -urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_different_buffer_sizes_2D -urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_column_2D -urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_row_2D 
-urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_3d -urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_3d_with_offsets -urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_2d_3d -urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_3d_2d -urEnqueueMemBufferCopyRectTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemBufferCopyRectTest.InvalidNullHandleBufferSrc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemBufferCopyRectTest.InvalidNullHandleBufferDst/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemBufferCopyRectTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemBufferCopyRectTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemBufferCopyTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024 -urEnqueueMemBufferCopyTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500 -urEnqueueMemBufferCopyTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096 -urEnqueueMemBufferCopyTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000 -urEnqueueMemBufferCopyTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024 -urEnqueueMemBufferCopyTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500 -urEnqueueMemBufferCopyTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096 -urEnqueueMemBufferCopyTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000 
-urEnqueueMemBufferCopyTestWithParam.InvalidNullHandleBufferSrc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024 -urEnqueueMemBufferCopyTestWithParam.InvalidNullHandleBufferSrc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500 -urEnqueueMemBufferCopyTestWithParam.InvalidNullHandleBufferSrc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096 -urEnqueueMemBufferCopyTestWithParam.InvalidNullHandleBufferSrc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000 -urEnqueueMemBufferCopyTestWithParam.InvalidNullHandleBufferDst/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024 -urEnqueueMemBufferCopyTestWithParam.InvalidNullHandleBufferDst/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500 -urEnqueueMemBufferCopyTestWithParam.InvalidNullHandleBufferDst/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096 -urEnqueueMemBufferCopyTestWithParam.InvalidNullHandleBufferDst/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000 -urEnqueueMemBufferCopyTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024 -urEnqueueMemBufferCopyTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500 -urEnqueueMemBufferCopyTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096 -urEnqueueMemBufferCopyTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000 -urEnqueueMemBufferCopyTestWithParam.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024 -urEnqueueMemBufferCopyTestWithParam.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500 -urEnqueueMemBufferCopyTestWithParam.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096 -urEnqueueMemBufferCopyTestWithParam.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000 
-urEnqueueMemBufferFillTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__1__patternSize__1 -urEnqueueMemBufferFillTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__256 -urEnqueueMemBufferFillTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__1024__patternSize__256 -urEnqueueMemBufferFillTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__4 -urEnqueueMemBufferFillTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__8 -urEnqueueMemBufferFillTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__16 -urEnqueueMemBufferFillTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__32 -urEnqueueMemBufferFillTest.SuccessPartialFill/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__256 -urEnqueueMemBufferFillTest.SuccessPartialFill/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__1024__patternSize__256 -urEnqueueMemBufferFillTest.SuccessPartialFill/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__4 -urEnqueueMemBufferFillTest.SuccessPartialFill/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__8 -urEnqueueMemBufferFillTest.SuccessPartialFill/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__16 -urEnqueueMemBufferFillTest.SuccessPartialFill/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__32 -urEnqueueMemBufferFillTest.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__256 -urEnqueueMemBufferFillTest.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__1024__patternSize__256 
-urEnqueueMemBufferFillTest.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__4 -urEnqueueMemBufferFillTest.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__8 -urEnqueueMemBufferFillTest.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__16 -urEnqueueMemBufferFillTest.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__32 -urEnqueueMemBufferFillNegativeTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY 
-urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccessRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY 
-urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccessOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY 
-urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccessPartialMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE 
-urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccesPinnedRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE 
-urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER 
-urEnqueueMemBufferMapTestWithParam.SuccesPinnedWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER 
-urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.SuccessMultiMaps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY 
-urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferMapTestWithWriteFlagParam.SuccessWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_MAP_FLAG_WRITE -urEnqueueMemBufferMapTestWithWriteFlagParam.SuccessWrite/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_MAP_FLAG_WRITE_INVALIDATE_REGION -urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY 
-urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferReadTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE 
-urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER 
-urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER 
-urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferReadTestWithParam.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY 
-urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferReadTestWithParam.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_whole_buffer_2D -urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_non_zero_offsets_2D -urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_different_buffer_sizes_2D -urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_column_2D -urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_row_2D -urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d -urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d_with_offsets -urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_2d_3d -urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d_2d -urEnqueueMemBufferReadRectTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
-urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER 
-urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferWriteTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY 
-urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferWriteTestWithParam.SuccessWriteRead/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY 
-urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_whole_buffer_2D -urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_non_zero_offsets_2D -urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_different_buffer_sizes_2D -urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_column_2D -urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_row_2D -urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d 
-urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d_with_offsets -urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_2d_3d -urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d_2d -urEnqueueMemBufferWriteRectTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageCopyTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D -urEnqueueMemImageCopyTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D -urEnqueueMemImageCopyTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D -urEnqueueMemImageCopyTest.SuccessPartialCopy/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D -urEnqueueMemImageCopyTest.SuccessPartialCopy/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D -urEnqueueMemImageCopyTest.SuccessPartialCopy/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D -urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D -urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D -urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D -urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D -urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D -urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D -urEnqueueMemImageCopyTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D 
-urEnqueueMemImageCopyTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D -urEnqueueMemImageCopyTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D -urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D -urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D -urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D -urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D -urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D -urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D -urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D -urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D -urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D -urEnqueueMemImageCopyTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D -urEnqueueMemImageCopyTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D -urEnqueueMemImageCopyTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D -urEnqueueMemImageReadTest.Success1D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageReadTest.Success2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageReadTest.Success3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageReadTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
-urEnqueueMemImageReadTest.InvalidOrigin1D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageReadTest.InvalidOrigin2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageReadTest.InvalidOrigin3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageReadTest.InvalidRegion1D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageReadTest.InvalidRegion2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageReadTest.InvalidRegion3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageWriteTest.Success1D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageWriteTest.Success2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageWriteTest.Success3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageWriteTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageWriteTest.InvalidOrigin1D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageWriteTest.InvalidOrigin2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageWriteTest.InvalidOrigin3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageWriteTest.InvalidRegion1D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageWriteTest.InvalidRegion2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemImageWriteTest.InvalidRegion3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE 
-urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE 
-urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER 
-urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY 
-urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.InvalidNullHandleMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY 
-urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.InvalidNullPtrMap/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_WRITE -urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_WRITE_ONLY 
-urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_WRITE_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_READ_ONLY -urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER -urEnqueueUSMFillTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__1__patternSize__1 -urEnqueueUSMFillTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__256 -urEnqueueUSMFillTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__1024__patternSize__256 -urEnqueueUSMFillTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__4 -urEnqueueUSMFillTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__8 
-urEnqueueUSMFillTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__16 -urEnqueueUSMFillTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__32 -urEnqueueUSMFillNegativeTest.InvalidEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMFill2DNegativeTest.OutOfBounds/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMFill2DNegativeTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMAdviseWithParamTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_USM_ADVICE_FLAG_DEFAULT -urEnqueueUSMAdviseTest.MultipleParamsSuccess/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMAdviseTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMAdviseTest.InvalidNullPointerMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMAdviseTest.InvalidEnumeration/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMAdviseTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMAdviseTest.InvalidSizeTooLarge/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMAdviseTest.NonCoherentDeviceMemorySuccessOrWarning/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMMemcpyTest.Blocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMMemcpyTest.BlockingWithEvent/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMMemcpyTest.NonBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMMemcpyTest.WaitForDependencies/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMMemcpyTest.InvalidNullQueueHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
-urEnqueueUSMMemcpyTest.InvalidNullDst/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMMemcpyTest.InvalidNullSrc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMMemcpyTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMPrefetchWithParamTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_USM_MIGRATION_FLAG_DEFAULT -urEnqueueUSMPrefetchWithParamTest.CheckWaitEvent/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_USM_MIGRATION_FLAG_DEFAULT -urEnqueueUSMPrefetchTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMPrefetchTest.InvalidNullPointerMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMPrefetchTest.InvalidEnumeration/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMPrefetchTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMPrefetchTest.InvalidSizeTooLarge/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueUSMPrefetchTest.InvalidEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueReadHostPipeTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueReadHostPipeTest.InvalidNullHandleProgram/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueReadHostPipeTest.InvalidNullPointerPipeSymbol/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueReadHostPipeTest.InvalidNullPointerBuffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueReadHostPipeTest.InvalidEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueWriteHostPipeTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueWriteHostPipeTest.InvalidNullHandleProgram/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
-urEnqueueWriteHostPipeTest.InvalidNullPointerPipeSymbol/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueWriteHostPipeTest.InvalidNullPointerBuffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueWriteHostPipeTest.InvalidEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueTimestampRecordingExpTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueTimestampRecordingExpTest.SuccessBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEnqueueTimestampRecordingExpTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/enqueue/enqueue_adapter_level_zero.match b/test/conformance/enqueue/enqueue_adapter_level_zero.match index 7e9b81485b..1c85a579b9 100644 --- a/test/conformance/enqueue/enqueue_adapter_level_zero.match +++ b/test/conformance/enqueue/enqueue_adapter_level_zero.match @@ -1,2 +1,25 @@ +{{NONDETERMINISTIC}} {{OPT}}urEnqueueEventsWaitTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urEnqueueKernelLaunchTest.InvalidKernelArgs/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urEnqueueKernelLaunchKernelWgSizeTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urEnqueueKernelLaunchKernelSubGroupTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urEnqueueKernelLaunchUSMLinkedList.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}__UsePoolEnabled +{{OPT}}urEnqueueKernelLaunchUSMLinkedList.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}__UsePoolDisabled +{{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_2d_3d +{{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_3d_2d 
+{{OPT}}urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_non_zero_offsets_2D +{{OPT}}urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_different_buffer_sizes_2D +{{OPT}}urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_column_2D +{{OPT}}urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d_with_offsets +{{OPT}}urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_2d_3d +{{OPT}}urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d_2d +{{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d_with_offsets +{{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_2d_3d +{{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d_2d +{{OPT}}urEnqueueMemImageCopyTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D +{{OPT}}urEnqueueMemImageCopyTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D +{{OPT}}urEnqueueMemImageCopyTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D +{{OPT}}urEnqueueMemImageReadTest.InvalidOrigin1D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urEnqueueMemImageReadTest.InvalidOrigin2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urEnqueueMemImageReadTest.InvalidOrigin3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ {{Segmentation fault|Aborted}} diff --git a/test/conformance/enqueue/enqueue_adapter_level_zero_v2.match 
b/test/conformance/enqueue/enqueue_adapter_level_zero_v2.match new file mode 100644 index 0000000000..65a7754f6c --- /dev/null +++ b/test/conformance/enqueue/enqueue_adapter_level_zero_v2.match @@ -0,0 +1,104 @@ +{{NONDETERMINISTIC}} +urEnqueueKernelLaunchTest.InvalidKernelArgs/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueKernelLaunchKernelWgSizeTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueKernelLaunchWithVirtualMemory.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueKernelLaunchUSMLinkedList.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled +urEnqueueKernelLaunchUSMLinkedList.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled +{{OPT}}urEnqueueKernelLaunchIncrementTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UseEventsEnabled +{{OPT}}urEnqueueKernelLaunchIncrementTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UseEventsDisabled +{{OPT}}urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest.Success/UseEventsNoQueuePerThread +{{OPT}}urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest.Success/NoUseEventsNoQueuePerThread +{{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_whole_buffer_2D +{{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_non_zero_offsets_2D +{{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_different_buffer_sizes_2D +{{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_column_2D +{{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_row_2D 
+{{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_3d +{{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_3d_with_offsets +{{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_2d_3d +{{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___copy_3d_2d +{{OPT}}urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_whole_buffer_2D +{{OPT}}urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_non_zero_offsets_2D +{{OPT}}urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_different_buffer_sizes_2D +{{OPT}}urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_column_2D +{{OPT}}urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_row_2D +{{OPT}}urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d +{{OPT}}urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d_with_offsets +{{OPT}}urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_2d_3d +{{OPT}}urEnqueueMemBufferReadRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d_2d +{{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_whole_buffer_2D +{{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_non_zero_offsets_2D 
+{{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_different_buffer_sizes_2D +{{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_column_2D +{{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_row_2D +{{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d +{{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d_with_offsets +{{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_2d_3d +{{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___write_3d_2d +urEnqueueMemImageCopyTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D +urEnqueueMemImageCopyTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D +urEnqueueMemImageCopyTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D +urEnqueueMemImageCopyTest.SuccessPartialCopy/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D +urEnqueueMemImageCopyTest.SuccessPartialCopy/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D +urEnqueueMemImageCopyTest.SuccessPartialCopy/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D +urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D +urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D +urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D 
+urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D +urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D +urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D +urEnqueueMemImageCopyTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D +urEnqueueMemImageCopyTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D +urEnqueueMemImageCopyTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D +urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D +urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D +urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D +urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D +urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D +urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D +urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D +urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D +urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D +urEnqueueMemImageCopyTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1D +urEnqueueMemImageCopyTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2D 
+urEnqueueMemImageCopyTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3D +urEnqueueMemImageReadTest.Success1D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageReadTest.Success2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageReadTest.Success3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageReadTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageReadTest.InvalidNullHandleImage/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageReadTest.InvalidNullPointerDst/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageReadTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageReadTest.InvalidOrigin1D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageReadTest.InvalidOrigin2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageReadTest.InvalidOrigin3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageReadTest.InvalidRegion1D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageReadTest.InvalidRegion2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageReadTest.InvalidRegion3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageWriteTest.Success1D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageWriteTest.Success2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageWriteTest.Success3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageWriteTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageWriteTest.InvalidNullHandleImage/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
+urEnqueueMemImageWriteTest.InvalidNullPointerSrc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageWriteTest.InvalidNullPtrEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageWriteTest.InvalidOrigin1D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageWriteTest.InvalidOrigin2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageWriteTest.InvalidOrigin3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageWriteTest.InvalidRegion1D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageWriteTest.InvalidRegion2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueMemImageWriteTest.InvalidRegion3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueUSMFill2DNegativeTest.OutOfBounds/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueUSMAdviseTest.InvalidSizeTooLarge/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueUSMPrefetchTest.InvalidSizeTooLarge/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueReadHostPipeTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueReadHostPipeTest.InvalidNullHandleProgram/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueReadHostPipeTest.InvalidNullPointerPipeSymbol/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueReadHostPipeTest.InvalidNullPointerBuffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueReadHostPipeTest.InvalidEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueWriteHostPipeTest.InvalidNullHandleQueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueWriteHostPipeTest.InvalidNullHandleProgram/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
+urEnqueueWriteHostPipeTest.InvalidNullPointerPipeSymbol/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueWriteHostPipeTest.InvalidNullPointerBuffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEnqueueWriteHostPipeTest.InvalidEventWaitList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urEnqueueTimestampRecordingExpTest.SuccessBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/enqueue/enqueue_adapter_native_cpu.match b/test/conformance/enqueue/enqueue_adapter_native_cpu.match index 83e9f2391e..f1ef4ef6f6 100644 --- a/test/conformance/enqueue/enqueue_adapter_native_cpu.match +++ b/test/conformance/enqueue/enqueue_adapter_native_cpu.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} {{OPT}}urEnqueueDeviceGetGlobalVariableReadTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}urEnqueueDeviceGetGlobalVariableReadTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}urEnqueueDeviceGetGlobalVariableReadTest.InvalidNullHandleProgram/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} @@ -17,6 +18,10 @@ {{OPT}}urEnqueueEventsWaitTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}urEnqueueEventsWaitWithBarrierTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}urEnqueueEventsWaitWithBarrierTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +urEnqueueEventsWaitWithBarrierOrderingTest.SuccessEventDependenciesBarrierOnly/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}_ +urEnqueueEventsWaitWithBarrierOrderingTest.SuccessEventDependenciesLaunchOnly/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}_ +urEnqueueEventsWaitWithBarrierOrderingTest.SuccessEventDependencies/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}_ +urEnqueueEventsWaitWithBarrierOrderingTest.SuccessNonEventDependencies/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}_ {{OPT}}urEnqueueKernelLaunchTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} 
{{OPT}}urEnqueueKernelLaunchTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}urEnqueueKernelLaunchTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} @@ -45,6 +50,7 @@ {{OPT}}urEnqueueKernelLaunchTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__3D_1_53_19 {{OPT}}urEnqueueKernelLaunchTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__3D_256_79_8 {{OPT}}urEnqueueKernelLaunchWithVirtualMemory.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urEnqueueKernelLaunchWithUSM.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}urEnqueueKernelLaunchMultiDeviceTest.KernelLaunchReadDifferentQueues/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}urEnqueueKernelLaunchUSMLinkedList.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UsePoolEnabled {{OPT}}urEnqueueKernelLaunchUSMLinkedList.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UsePoolDisabled diff --git a/test/conformance/enqueue/enqueue_adapter_opencl.match b/test/conformance/enqueue/enqueue_adapter_opencl.match index 7bb41276d4..27ae88c43d 100644 --- a/test/conformance/enqueue/enqueue_adapter_opencl.match +++ b/test/conformance/enqueue/enqueue_adapter_opencl.match @@ -1,4 +1,3 @@ -{{OPT}}urEnqueueDeviceGetGlobalVariableReadTest.Success/Intel_R__OpenCL___{{.*}}_ +{{NONDETERMINISTIC}} urEnqueueKernelLaunchKernelWgSizeTest.Success/Intel_R__OpenCL___{{.*}}_ -urEnqueueKernelLaunchKernelSubGroupTest.Success/Intel_R__OpenCL___{{.*}}_ {{OPT}}urEnqueueKernelLaunchUSMLinkedList.Success/Intel_R__OpenCL___{{.*}}_UsePoolEnabled diff --git a/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp b/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp index fe630c4018..8cf6401211 100644 --- a/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp +++ b/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023 Intel Corporation +// Copyright (C) 2024 Intel Corporation // Part of the 
Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -36,6 +36,32 @@ struct urEnqueueEventsWaitWithBarrierTest : uur::urMultiQueueTest { UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueEventsWaitWithBarrierTest); +struct urEnqueueEventsWaitWithBarrierOrderingTest : uur::urProgramTest { + void SetUp() override { + program_name = "sequence"; + UUR_RETURN_ON_FATAL_FAILURE(urProgramTest::SetUp()); + ASSERT_SUCCESS(urProgramBuild(context, program, nullptr)); + + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, + sizeof(uint32_t), nullptr, &buffer)); + + auto entry_points = + uur::KernelsEnvironment::instance->GetEntryPointNames(program_name); + std::cout << entry_points[0]; + + ASSERT_SUCCESS(urKernelCreate(program, "_ZTS3Add", &add_kernel)); + ASSERT_SUCCESS(urKernelCreate(program, "_ZTS3Mul", &mul_kernel)); + } + + void TearDown() override { uur::urProgramTest::TearDown(); } + + ur_kernel_handle_t add_kernel; + ur_kernel_handle_t mul_kernel; + ur_mem_handle_t buffer = nullptr; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueEventsWaitWithBarrierOrderingTest); + TEST_P(urEnqueueEventsWaitWithBarrierTest, Success) { ur_event_handle_t event1 = nullptr; ur_event_handle_t waitEvent = nullptr; @@ -97,3 +123,129 @@ TEST_P(urEnqueueEventsWaitWithBarrierTest, InvalidNullPtrEventWaitList) { ASSERT_SUCCESS(urEventRelease(validEvent)); } + +TEST_P(urEnqueueEventsWaitWithBarrierOrderingTest, + SuccessEventDependenciesBarrierOnly) { + constexpr size_t offset = 0; + constexpr size_t count = 1; + ur_event_handle_t event; + + uur::KernelLaunchHelper addHelper(platform, context, add_kernel, queue); + uur::KernelLaunchHelper mulHelper(platform, context, mul_kernel, queue); + + addHelper.SetBuffer1DArg(buffer, nullptr); + mulHelper.SetBuffer1DArg(buffer, nullptr); + + for (size_t i = 0; i < 10; i++) { + constexpr uint32_t ONE = 1; + urEnqueueMemBufferWrite(queue, 
buffer, true, 0, sizeof(uint32_t), &ONE, + 0, nullptr, &event); + EXPECT_SUCCESS( + urEnqueueEventsWaitWithBarrier(queue, 1, &event, nullptr)); + EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, add_kernel, 1, &offset, + &count, nullptr, 0, nullptr, + &event)); + EXPECT_SUCCESS( + urEnqueueEventsWaitWithBarrier(queue, 1, &event, nullptr)); + EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, mul_kernel, 1, &offset, + &count, nullptr, 0, nullptr, + &event)); + EXPECT_SUCCESS( + urEnqueueEventsWaitWithBarrier(queue, 1, &event, nullptr)); + addHelper.ValidateBuffer(buffer, sizeof(uint32_t), 4004); + } +} + +TEST_P(urEnqueueEventsWaitWithBarrierOrderingTest, + SuccessEventDependenciesLaunchOnly) { + constexpr size_t offset = 0; + constexpr size_t count = 1; + ur_event_handle_t event; + + uur::KernelLaunchHelper addHelper(platform, context, add_kernel, queue); + uur::KernelLaunchHelper mulHelper(platform, context, mul_kernel, queue); + + addHelper.SetBuffer1DArg(buffer, nullptr); + mulHelper.SetBuffer1DArg(buffer, nullptr); + + for (size_t i = 0; i < 10; i++) { + constexpr uint32_t ONE = 1; + urEnqueueMemBufferWrite(queue, buffer, true, 0, sizeof(uint32_t), &ONE, + 0, nullptr, nullptr); + EXPECT_SUCCESS( + urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, &event)); + EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, add_kernel, 1, &offset, + &count, nullptr, 1, &event, + nullptr)); + EXPECT_SUCCESS( + urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, &event)); + EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, mul_kernel, 1, &offset, + &count, nullptr, 1, &event, + nullptr)); + EXPECT_SUCCESS( + urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, &event)); + addHelper.ValidateBuffer(buffer, sizeof(uint32_t), 4004); + } +} + +TEST_P(urEnqueueEventsWaitWithBarrierOrderingTest, SuccessEventDependencies) { + constexpr size_t offset = 0; + constexpr size_t count = 1; + ur_event_handle_t event[6]; + + uur::KernelLaunchHelper addHelper(platform, context, add_kernel, queue); + uur::KernelLaunchHelper 
mulHelper(platform, context, mul_kernel, queue); + + addHelper.SetBuffer1DArg(buffer, nullptr); + mulHelper.SetBuffer1DArg(buffer, nullptr); + + for (size_t i = 0; i < 10; i++) { + constexpr uint32_t ONE = 1; + urEnqueueMemBufferWrite(queue, buffer, true, 0, sizeof(uint32_t), &ONE, + 0, nullptr, &event[0]); + EXPECT_SUCCESS( + urEnqueueEventsWaitWithBarrier(queue, 1, &event[0], &event[1])); + EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, add_kernel, 1, &offset, + &count, nullptr, 1, &event[1], + &event[2])); + EXPECT_SUCCESS( + urEnqueueEventsWaitWithBarrier(queue, 1, &event[2], &event[3])); + EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, mul_kernel, 1, &offset, + &count, nullptr, 1, &event[3], + &event[4])); + EXPECT_SUCCESS( + urEnqueueEventsWaitWithBarrier(queue, 1, &event[4], &event[5])); + addHelper.ValidateBuffer(buffer, sizeof(uint32_t), 4004); + } +} + +TEST_P(urEnqueueEventsWaitWithBarrierOrderingTest, + SuccessNonEventDependencies) { + constexpr size_t offset = 0; + constexpr size_t count = 1; + + uur::KernelLaunchHelper addHelper(platform, context, add_kernel, queue); + uur::KernelLaunchHelper mulHelper(platform, context, mul_kernel, queue); + + addHelper.SetBuffer1DArg(buffer, nullptr); + mulHelper.SetBuffer1DArg(buffer, nullptr); + + for (size_t i = 0; i < 10; i++) { + constexpr uint32_t ONE = 1; + urEnqueueMemBufferWrite(queue, buffer, true, 0, sizeof(uint32_t), &ONE, + 0, nullptr, nullptr); + EXPECT_SUCCESS( + urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, nullptr)); + EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, add_kernel, 1, &offset, + &count, nullptr, 0, nullptr, + nullptr)); + EXPECT_SUCCESS( + urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, nullptr)); + EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, mul_kernel, 1, &offset, + &count, nullptr, 0, nullptr, + nullptr)); + EXPECT_SUCCESS( + urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, nullptr)); + addHelper.ValidateBuffer(buffer, sizeof(uint32_t), 4004); + } +} diff --git 
a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp index dded6a67e4..aa1c436fa8 100644 --- a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp +++ b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp @@ -3,6 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include #include struct urEnqueueKernelLaunchTest : uur::urKernelExecutionTest { @@ -179,6 +180,8 @@ TEST_P(urEnqueueKernelLaunchKernelSubGroupTest, Success) { queue, kernel, n_dimensions, global_offset.data(), global_size.data(), nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); + // We specify this subgroup size in the kernel source, and then the kernel + // queries for its subgroup size at runtime and writes it to the buffer. ValidateBuffer(buffer, sizeof(size_t), 8); } @@ -229,7 +232,7 @@ inline std::string printKernelLaunchTestString( } struct urEnqueueKernelLaunchTestWithParam - : uur::urBaseKernelExecutionTestWithParam { + : uur::urKernelExecutionTestWithParam { void SetUp() override { global_range[0] = std::get<1>(GetParam()).X; global_range[1] = std::get<1>(GetParam()).Y; @@ -246,12 +249,11 @@ struct urEnqueueKernelLaunchTestWithParam program_name = "fill_3d"; buffer_size *= global_range[1] * global_range[2]; } - UUR_RETURN_ON_FATAL_FAILURE( - urBaseKernelExecutionTestWithParam::SetUp()); + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTestWithParam::SetUp()); } void TearDown() override { - UUR_RETURN_ON_FATAL_FAILURE(uur::urBaseKernelExecutionTestWithParam< + UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTestWithParam< testParametersEnqueueKernel>::TearDown()); } @@ -295,6 +297,68 @@ TEST_P(urEnqueueKernelLaunchTestWithParam, Success) { ValidateBuffer(buffer, buffer_size, val); } +struct urEnqueueKernelLaunchWithUSM : uur::urKernelExecutionTest { + + void SetUp() override { + program_name = "fill_usm"; + UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTest::SetUp()); + + 
ur_device_usm_access_capability_flags_t device_usm = 0; + ASSERT_SUCCESS(uur::GetDeviceUSMDeviceSupport(device, device_usm)); + if (!device_usm) { + GTEST_SKIP() << "Device USM is not supported"; + } + + alloc_size = 1024; + + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + alloc_size, &usmPtr)); + + ASSERT_SUCCESS(urQueueFinish(queue)); + } + + void TearDown() override { + + if (usmPtr) { + EXPECT_SUCCESS(urUSMFree(context, usmPtr)); + } + + UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTest::TearDown()); + } + + size_t alloc_size = 0; + void *usmPtr = nullptr; +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunchWithUSM); + +TEST_P(urEnqueueKernelLaunchWithUSM, Success) { + size_t work_dim = 1; + size_t global_offset = 0; + size_t global_size = alloc_size / sizeof(uint32_t); + uint32_t fill_val = 42; + + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, usmPtr)); + ASSERT_SUCCESS( + urKernelSetArgValue(kernel, 1, sizeof(fill_val), nullptr, &fill_val)); + + auto *ptr = static_cast(usmPtr); + for (size_t i = 0; i < global_size; i++) { + ptr[i] = 0; + } + + ur_event_handle_t kernel_evt; + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, work_dim, + &global_offset, &global_size, nullptr, + 0, nullptr, &kernel_evt)); + + ASSERT_SUCCESS(urQueueFinish(queue)); + + // verify fill worked + for (size_t i = 0; i < global_size; i++) { + ASSERT_EQ(ptr[i], fill_val); + } +} + struct urEnqueueKernelLaunchWithVirtualMemory : uur::urKernelExecutionTest { void SetUp() override { diff --git a/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp b/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp new file mode 100644 index 0000000000..69f315e115 --- /dev/null +++ b/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp @@ -0,0 +1,441 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include + +#include +#include + +// There was a bug in previous L0 drivers that caused the test to fail +std::tuple minL0DriverVersion = {1, 3, 29534}; + +template +struct urMultiQueueLaunchMemcpyTest : uur::urMultiDeviceContextTestTemplate<1>, + testing::WithParamInterface { + std::string KernelName; + std::vector programs; + std::vector kernels; + std::vector SharedMem; + + std::vector queues; + std::vector devices; + + std::function createQueues; + + static constexpr char ProgramName[] = "increment"; + static constexpr size_t ArraySize = 100; + static constexpr uint32_t InitialValue = 1; + + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::urMultiDeviceContextTestTemplate<1>::SetUp()); + + createQueues(); + + for (auto &device : devices) { + SKIP_IF_DRIVER_TOO_OLD("Level-Zero", minL0DriverVersion, platform, + device); + } + + programs.resize(devices.size()); + kernels.resize(devices.size()); + SharedMem.resize(devices.size()); + + KernelName = uur::KernelsEnvironment::instance->GetEntryPointNames( + ProgramName)[0]; + + std::shared_ptr> il_binary; + std::vector metadatas{}; + + uur::KernelsEnvironment::instance->LoadSource(ProgramName, il_binary); + + for (size_t i = 0; i < devices.size(); i++) { + const ur_program_properties_t properties = { + UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES, nullptr, + static_cast(metadatas.size()), + metadatas.empty() ? 
nullptr : metadatas.data()}; + + uur::raii::Program program; + ASSERT_SUCCESS(uur::KernelsEnvironment::instance->CreateProgram( + platform, context, devices[i], *il_binary, &properties, + &programs[i])); + + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urProgramBuild(context, programs[i], nullptr)); + ASSERT_SUCCESS( + urKernelCreate(programs[i], KernelName.data(), &kernels[i])); + + ASSERT_SUCCESS( + urUSMSharedAlloc(context, devices[i], nullptr, nullptr, + ArraySize * sizeof(uint32_t), &SharedMem[i])); + ASSERT_NE(SharedMem[i], nullptr); + + ASSERT_SUCCESS(urEnqueueUSMFill(queues[i], SharedMem[i], + sizeof(uint32_t), &InitialValue, + ArraySize * sizeof(uint32_t), 0, + nullptr, nullptr /* &Event */)); + ASSERT_SUCCESS(urQueueFinish(queues[i])); + + ASSERT_SUCCESS( + urKernelSetArgPointer(kernels[i], 0, nullptr, SharedMem[i])); + } + } + + void TearDown() override { + for (auto &Ptr : SharedMem) { + urUSMFree(context, Ptr); + } + for (const auto &queue : queues) { + EXPECT_SUCCESS(urQueueRelease(queue)); + } + for (const auto &kernel : kernels) { + urKernelRelease(kernel); + } + for (const auto &program : programs) { + urProgramRelease(program); + } + UUR_RETURN_ON_FATAL_FAILURE( + uur::urMultiDeviceContextTestTemplate<1>::TearDown()); + } + + void runBackgroundCheck(std::vector &Events) { + std::vector threads; + for (size_t i = 0; i < Events.size(); i++) { + threads.emplace_back([&, i] { + ur_event_status_t status; + do { + ASSERT_SUCCESS(urEventGetInfo( + Events[i].get(), UR_EVENT_INFO_COMMAND_EXECUTION_STATUS, + sizeof(ur_event_status_t), &status, nullptr)); + } while (status != UR_EVENT_STATUS_COMPLETE); + + auto ExpectedValue = InitialValue + i + 1; + for (uint32_t j = 0; j < ArraySize; ++j) { + ASSERT_EQ(reinterpret_cast(SharedMem[i])[j], + ExpectedValue); + } + }); + } + for (auto &thread : threads) { + thread.join(); + } + } +}; + +template +struct urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam + : public urMultiQueueLaunchMemcpyTest { + static constexpr 
size_t duplicateDevices = 8; + + using urMultiQueueLaunchMemcpyTest::context; + using urMultiQueueLaunchMemcpyTest::queues; + using urMultiQueueLaunchMemcpyTest::devices; + using urMultiQueueLaunchMemcpyTest::kernels; + using urMultiQueueLaunchMemcpyTest::SharedMem; + + void SetUp() override { + this->createQueues = [&] { + for (size_t i = 0; i < duplicateDevices; i++) { + devices.insert( + devices.end(), + uur::KernelsEnvironment::instance->devices.begin(), + uur::KernelsEnvironment::instance->devices.end()); + } + + for (auto &device : devices) { + ur_queue_handle_t queue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, device, 0, &queue)); + queues.push_back(queue); + } + }; + + UUR_RETURN_ON_FATAL_FAILURE( + urMultiQueueLaunchMemcpyTest::SetUp()); + } + + void TearDown() override { + UUR_RETURN_ON_FATAL_FAILURE( + urMultiQueueLaunchMemcpyTest::TearDown()); + } +}; + +struct urEnqueueKernelLaunchIncrementTest + : urMultiQueueLaunchMemcpyTest< + std::tuple> { + static constexpr size_t numOps = 50; + + ur_queue_handle_t queue; + + using Param = std::tuple; + using urMultiQueueLaunchMemcpyTest::context; + using urMultiQueueLaunchMemcpyTest::queues; + using urMultiQueueLaunchMemcpyTest::devices; + using urMultiQueueLaunchMemcpyTest::kernels; + using urMultiQueueLaunchMemcpyTest::SharedMem; + + void SetUp() override { + auto device = std::get<0>(GetParam()); + + this->createQueues = [&] { + ASSERT_SUCCESS(urQueueCreate(context, device, 0, &queue)); + + // use the same queue and device for all operations + for (size_t i = 0; i < numOps; i++) { + urQueueRetain(queue); + + queues.push_back(queue); + devices.push_back(device); + } + }; + + UUR_RETURN_ON_FATAL_FAILURE( + urMultiQueueLaunchMemcpyTest::SetUp()); + } + + void TearDown() override { + urQueueRelease(queue); + UUR_RETURN_ON_FATAL_FAILURE( + urMultiQueueLaunchMemcpyTest::TearDown()); + } +}; + +UUR_TEST_SUITE_P( + urEnqueueKernelLaunchIncrementTest, + 
testing::ValuesIn(uur::BoolTestParam::makeBoolParam("UseEvents")), + uur::deviceTestWithParamPrinter); + +TEST_P(urEnqueueKernelLaunchIncrementTest, Success) { + constexpr size_t global_offset = 0; + constexpr size_t n_dimensions = 1; + + auto useEvents = std::get<1>(GetParam()).value; + + std::vector kernelEvents(numOps); + std::vector memcpyEvents(numOps - 1); + + ur_event_handle_t *lastMemcpyEvent = nullptr; + ur_event_handle_t *kernelEvent = nullptr; + ur_event_handle_t *memcpyEvent = nullptr; + + for (size_t i = 0; i < numOps; i++) { + if (useEvents) { + lastMemcpyEvent = memcpyEvent; + kernelEvent = kernelEvents[i].ptr(); + memcpyEvent = i < numOps - 1 ? memcpyEvents[i].ptr() : nullptr; + } + + // execute kernel that increments each element by 1 + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queue, kernels[i], n_dimensions, &global_offset, &ArraySize, + nullptr, bool(lastMemcpyEvent), lastMemcpyEvent, kernelEvent)); + + // copy the memory (input for the next kernel) + if (i < numOps - 1) { + ASSERT_SUCCESS( + urEnqueueUSMMemcpy(queue, false, SharedMem[i + 1], SharedMem[i], + ArraySize * sizeof(uint32_t), useEvents, + kernelEvent, memcpyEvent)); + } + } + + if (useEvents) { + ASSERT_SUCCESS(urEventWait(1, kernelEvents.back().ptr())); + } else { + ASSERT_SUCCESS(urQueueFinish(queue)); + } + + size_t ExpectedValue = InitialValue; + for (size_t i = 0; i < numOps; i++) { + ExpectedValue++; + for (uint32_t j = 0; j < ArraySize; ++j) { + ASSERT_EQ(reinterpret_cast(SharedMem[i])[j], + ExpectedValue); + } + } +} + +template +inline std::string +printParams(const testing::TestParamInfo &info) { + std::stringstream ss; + + auto param1 = std::get<0>(info.param); + ss << (param1.value ? "" : "No") << param1.name; + + auto param2 = std::get<1>(info.param); + ss << (param2.value ? 
"" : "No") << param2.name; + + if constexpr (std::tuple_size_v < typename T::ParamType >> 2) { + auto param3 = std::get<2>(info.param); + } + + return ss.str(); +} + +using urEnqueueKernelLaunchIncrementMultiDeviceTest = + urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam< + std::tuple>; + +INSTANTIATE_TEST_SUITE_P( + , urEnqueueKernelLaunchIncrementMultiDeviceTest, + testing::Combine( + testing::ValuesIn(uur::BoolTestParam::makeBoolParam("UseEventWait")), + testing::ValuesIn( + uur::BoolTestParam::makeBoolParam("RunBackgroundCheck"))), + printParams); + +// Do a chain of kernelLaunch(dev0) -> memcpy(dev0, dev1) -> kernelLaunch(dev1) ... ops +TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceTest, Success) { + auto waitOnEvent = std::get<0>(GetParam()).value; + auto runBackgroundCheck = std::get<1>(GetParam()).value; + + size_t returned_size; + ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_EXTENSIONS, 0, + nullptr, &returned_size)); + + std::unique_ptr returned_extensions(new char[returned_size]); + + ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_EXTENSIONS, + returned_size, returned_extensions.get(), + nullptr)); + + std::string_view extensions_string(returned_extensions.get()); + const bool usm_p2p_support = + extensions_string.find(UR_USM_P2P_EXTENSION_STRING_EXP) != + std::string::npos; + + if (!usm_p2p_support) { + GTEST_SKIP() << "EXP usm p2p feature is not supported."; + } + + constexpr size_t global_offset = 0; + constexpr size_t n_dimensions = 1; + + std::vector kernelEvents(devices.size()); + std::vector memcpyEvents(devices.size() - 1); + + ur_event_handle_t *lastMemcpyEvent = nullptr; + ur_event_handle_t *kernelEvent = nullptr; + ur_event_handle_t *memcpyEvent = nullptr; + + for (size_t i = 0; i < devices.size(); i++) { + lastMemcpyEvent = memcpyEvent; + kernelEvent = kernelEvents[i].ptr(); + memcpyEvent = i < devices.size() - 1 ? 
memcpyEvents[i].ptr() : nullptr; + + // execute kernel that increments each element by 1 + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queues[i], kernels[i], n_dimensions, &global_offset, &ArraySize, + nullptr, bool(lastMemcpyEvent), lastMemcpyEvent, kernelEvent)); + + // copy the memory to next device + if (i < devices.size() - 1) { + ASSERT_SUCCESS(urEnqueueUSMMemcpy( + queues[i], false, SharedMem[i + 1], SharedMem[i], + ArraySize * sizeof(uint32_t), 1, kernelEvent, memcpyEvent)); + } + } + + // While the device(s) execute, loop over the events and if completed, verify the results + if (runBackgroundCheck) { + this->runBackgroundCheck(kernelEvents); + } + + // synchronize on the last queue/event only, this has to ensure all the operations + // are completed + if (waitOnEvent) { + ASSERT_SUCCESS(urEventWait(1, kernelEvents.back().ptr())); + } else { + ASSERT_SUCCESS(urQueueFinish(queues.back())); + } + + size_t ExpectedValue = InitialValue; + for (size_t i = 0; i < devices.size(); i++) { + ExpectedValue++; + for (uint32_t j = 0; j < ArraySize; ++j) { + ASSERT_EQ(reinterpret_cast(SharedMem[i])[j], + ExpectedValue); + } + } +} + +using urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest = + urEnqueueKernelLaunchIncrementMultiDeviceTestWithParam< + std::tuple>; + +INSTANTIATE_TEST_SUITE_P( + , urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest, + testing::Combine( + testing::ValuesIn(uur::BoolTestParam::makeBoolParam("UseEvents")), + testing::ValuesIn(uur::BoolTestParam::makeBoolParam("QueuePerThread"))), + printParams); + +// Enqueue kernelLaunch concurrently from multiple threads +// With !queuePerThread this becomes a test on a single device +TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest, Success) { + size_t numThreads = devices.size(); + std::vector threads; + + static constexpr size_t numOpsPerThread = 6; + + auto useEvents = std::get<0>(GetParam()).value; + auto queuePerThread = std::get<1>(GetParam()).value; + + for (size_t i = 0; i < 
numThreads; i++) { + threads.emplace_back([this, i, queuePerThread, useEvents]() { + constexpr size_t global_offset = 0; + constexpr size_t n_dimensions = 1; + + auto queue = queuePerThread ? queues[i] : queues.back(); + auto kernel = kernels[i]; + auto sharedPtr = SharedMem[i]; + + std::vector Events(numOpsPerThread + 1); + for (size_t j = 0; j < numOpsPerThread; j++) { + size_t waitNum = 0; + ur_event_handle_t *lastEvent = nullptr; + ur_event_handle_t *signalEvent = nullptr; + + if (useEvents) { + waitNum = j > 0 ? 1 : 0; + lastEvent = j > 0 ? Events[j - 1].ptr() : nullptr; + signalEvent = Events[j].ptr(); + } + + // execute kernel that increments each element by 1 + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queue, kernel, n_dimensions, &global_offset, &ArraySize, + nullptr, waitNum, lastEvent, signalEvent)); + } + + std::vector data(ArraySize); + + auto lastEvent = + useEvents ? Events[numOpsPerThread - 1].ptr() : nullptr; + auto signalEvent = useEvents ? Events.back().ptr() : nullptr; + ASSERT_SUCCESS( + urEnqueueUSMMemcpy(queue, false, data.data(), sharedPtr, + ArraySize * sizeof(uint32_t), useEvents, + lastEvent, signalEvent)); + + if (useEvents) { + ASSERT_SUCCESS(urEventWait(1, Events.back().ptr())); + } else { + ASSERT_SUCCESS(urQueueFinish(queue)); + } + + size_t ExpectedValue = InitialValue; + ExpectedValue += numOpsPerThread; + for (uint32_t j = 0; j < ArraySize; ++j) { + ASSERT_EQ(data[j], ExpectedValue); + } + }); + } + + for (auto &thread : threads) { + thread.join(); + } +} diff --git a/test/conformance/enqueue/urEnqueueMemBufferFill.cpp b/test/conformance/enqueue/urEnqueueMemBufferFill.cpp index f4c8caabc9..89539557fe 100644 --- a/test/conformance/enqueue/urEnqueueMemBufferFill.cpp +++ b/test/conformance/enqueue/urEnqueueMemBufferFill.cpp @@ -19,8 +19,13 @@ struct urEnqueueMemBufferFillTest pattern_size = std::get<1>(GetParam()).pattern_size; pattern = std::vector(pattern_size); uur::generateMemFillPattern(pattern); - 
ASSERT_SUCCESS(urMemBufferCreate(this->context, UR_MEM_FLAG_READ_WRITE, - size, nullptr, &buffer)); + auto ret = urMemBufferCreate(this->context, UR_MEM_FLAG_READ_WRITE, + size, nullptr, &buffer); + if (ret == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { + GTEST_SKIP() << "Buffer creation is not supported"; + } else { + EXPECT_EQ(UR_RESULT_SUCCESS, ret); + } } void TearDown() override { diff --git a/test/conformance/enqueue/urEnqueueTimestampRecording.cpp b/test/conformance/enqueue/urEnqueueTimestampRecording.cpp index 5fc8ee5547..d3a97e3b48 100644 --- a/test/conformance/enqueue/urEnqueueTimestampRecording.cpp +++ b/test/conformance/enqueue/urEnqueueTimestampRecording.cpp @@ -35,13 +35,13 @@ void common_check(ur_event_handle_t event) { ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_END, sizeof(uint64_t), &endTime, nullptr)); - ASSERT_TRUE(queuedTime > 0); - ASSERT_TRUE(submitTime > 0); - ASSERT_TRUE(startTime > 0); - ASSERT_TRUE(endTime > 0); - ASSERT_TRUE(queuedTime == submitTime); - ASSERT_TRUE(startTime == endTime); - ASSERT_TRUE(endTime >= submitTime); + ASSERT_GT(queuedTime, 0); + ASSERT_GT(submitTime, 0); + ASSERT_GT(startTime, 0); + ASSERT_GT(endTime, 0); + ASSERT_EQ(queuedTime, submitTime); + ASSERT_EQ(startTime, endTime); + ASSERT_GE(endTime, submitTime); } TEST_P(urEnqueueTimestampRecordingExpTest, Success) { diff --git a/test/conformance/event/event_adapter_cuda.match b/test/conformance/event/event_adapter_cuda.match index 3cffb24c5f..d9e14551da 100644 --- a/test/conformance/event/event_adapter_cuda.match +++ b/test/conformance/event/event_adapter_cuda.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urEventGetProfilingInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROFILING_INFO_COMMAND_COMPLETE urEventGetProfilingInfoWithTimingComparisonTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ urEventSetCallbackTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ diff --git a/test/conformance/event/event_adapter_hip.match 
b/test/conformance/event/event_adapter_hip.match index b25428a187..6bc909c5fd 100644 --- a/test/conformance/event/event_adapter_hip.match +++ b/test/conformance/event/event_adapter_hip.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urEventGetProfilingInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_PROFILING_INFO_COMMAND_COMPLETE urEventGetProfilingInfoWithTimingComparisonTest.Success/AMD_HIP_BACKEND___{{.*}}_ urEventSetCallbackTest.Success/AMD_HIP_BACKEND___{{.*}}_ diff --git a/test/conformance/event/event_adapter_level_zero-v2.match b/test/conformance/event/event_adapter_level_zero-v2.match deleted file mode 100644 index a9d97d5044..0000000000 --- a/test/conformance/event/event_adapter_level_zero-v2.match +++ /dev/null @@ -1,38 +0,0 @@ - -urEventGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_EVENT_INFO_COMMAND_QUEUE -urEventGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_EVENT_INFO_CONTEXT -urEventGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_EVENT_INFO_COMMAND_TYPE -urEventGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_EVENT_INFO_COMMAND_EXECUTION_STATUS -urEventGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_EVENT_INFO_REFERENCE_COUNT -urEventGetInfoNegativeTest.InvalidNullHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventGetInfoNegativeTest.InvalidEnumeration/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventGetInfoNegativeTest.InvalidSizePropSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventGetInfoNegativeTest.InvalidSizePropSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventGetInfoNegativeTest.InvalidNullPointerPropValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventGetInfoNegativeTest.InvalidNullPointerPropSizeRet/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
-urEventGetProfilingInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROFILING_INFO_COMMAND_QUEUED -urEventGetProfilingInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROFILING_INFO_COMMAND_SUBMIT -urEventGetProfilingInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROFILING_INFO_COMMAND_START -urEventGetProfilingInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROFILING_INFO_COMMAND_END -urEventGetProfilingInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROFILING_INFO_COMMAND_COMPLETE -urEventGetProfilingInfoWithTimingComparisonTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventGetProfilingInfoNegativeTest.InvalidNullHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventGetProfilingInfoNegativeTest.InvalidEnumeration/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventGetProfilingInfoNegativeTest.InvalidValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventWaitTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventRetainTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventRetainTest.InvalidNullHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventReleaseTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventReleaseTest.InvalidNullHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventGetNativeHandleTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventGetNativeHandleTest.InvalidNullHandleEvent/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventGetNativeHandleTest.InvalidNullPointerNativeEvent/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventCreateWithNativeHandleTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
-urEventSetCallbackTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventSetCallbackTest.ValidateParameters/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventSetCallbackTest.AllStates/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventSetCallbackTest.EventAlreadyCompleted/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventSetCallbackNegativeTest.InvalidNullHandleEvent/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventSetCallbackNegativeTest.InvalidNullPointerCallback/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urEventSetCallbackNegativeTest.InvalidEnumeration/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -{{OPT}}{{Segmentation fault|Aborted}} diff --git a/test/conformance/event/event_adapter_level_zero.match b/test/conformance/event/event_adapter_level_zero.match index 32ffbeaf1e..cae719ef16 100644 --- a/test/conformance/event/event_adapter_level_zero.match +++ b/test/conformance/event/event_adapter_level_zero.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} {{OPT}}urEventGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_EVENT_INFO_COMMAND_TYPE {{OPT}}urEventGetProfilingInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROFILING_INFO_COMMAND_QUEUED {{OPT}}urEventGetProfilingInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROFILING_INFO_COMMAND_SUBMIT diff --git a/test/conformance/event/event_adapter_level_zero_v2.match b/test/conformance/event/event_adapter_level_zero_v2.match new file mode 100644 index 0000000000..911e7b6783 --- /dev/null +++ b/test/conformance/event/event_adapter_level_zero_v2.match @@ -0,0 +1,14 @@ +{{NONDETERMINISTIC}} +urEventGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_EVENT_INFO_COMMAND_QUEUE +urEventGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_EVENT_INFO_CONTEXT 
+urEventGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_EVENT_INFO_COMMAND_TYPE +urEventGetInfoNegativeTest.InvalidNullHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEventGetInfoNegativeTest.InvalidSizePropSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEventGetInfoNegativeTest.InvalidSizePropSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urEventGetProfilingInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROFILING_INFO_COMMAND_QUEUED +{{OPT}}urEventGetProfilingInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROFILING_INFO_COMMAND_SUBMIT +urEventGetProfilingInfoWithTimingComparisonTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEventSetCallbackTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEventSetCallbackTest.ValidateParameters/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEventSetCallbackTest.AllStates/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urEventSetCallbackTest.EventAlreadyCompleted/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/event/event_adapter_native_cpu.match b/test/conformance/event/event_adapter_native_cpu.match index fe9e18f4ac..17066b6d52 100644 --- a/test/conformance/event/event_adapter_native_cpu.match +++ b/test/conformance/event/event_adapter_native_cpu.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urEventGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_EVENT_INFO_COMMAND_QUEUE urEventGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_EVENT_INFO_CONTEXT urEventGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_EVENT_INFO_COMMAND_TYPE diff --git a/test/conformance/event/event_adapter_opencl.match b/test/conformance/event/event_adapter_opencl.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git 
a/test/conformance/exp_command_buffer/CMakeLists.txt b/test/conformance/exp_command_buffer/CMakeLists.txt index a8ecf793ab..9845ba86b1 100644 --- a/test/conformance/exp_command_buffer/CMakeLists.txt +++ b/test/conformance/exp_command_buffer/CMakeLists.txt @@ -4,12 +4,19 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception add_conformance_test_with_kernels_environment(exp_command_buffer - buffer_fill_kernel_update.cpp - usm_fill_kernel_update.cpp - buffer_saxpy_kernel_update.cpp - usm_saxpy_kernel_update.cpp - ndrange_update.cpp release.cpp retain.cpp - invalid_update.cpp + commands.cpp + fill.cpp + event_sync.cpp + kernel_event_sync.cpp + update/buffer_fill_kernel_update.cpp + update/invalid_update.cpp + update/kernel_handle_update.cpp + update/usm_fill_kernel_update.cpp + update/buffer_saxpy_kernel_update.cpp + update/ndrange_update.cpp + update/usm_saxpy_kernel_update.cpp + update/event_sync.cpp + update/kernel_event_sync.cpp ) diff --git a/test/conformance/exp_command_buffer/commands.cpp b/test/conformance/exp_command_buffer/commands.cpp new file mode 100644 index 0000000000..49b2444176 --- /dev/null +++ b/test/conformance/exp_command_buffer/commands.cpp @@ -0,0 +1,206 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" +#include + +struct urCommandBufferCommandsTest + : uur::command_buffer::urCommandBufferExpTest { + + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferExpTest::SetUp()); + + // Allocate USM pointers + for (auto &device_ptr : device_ptrs) { + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + allocation_size, &device_ptr)); + ASSERT_NE(device_ptr, nullptr); + } + + for (auto &buffer : buffers) { + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, + allocation_size, nullptr, + &buffer)); + + ASSERT_NE(buffer, nullptr); + } + } + + void TearDown() override { + for (auto &device_ptr : device_ptrs) { + if (device_ptr) { + EXPECT_SUCCESS(urUSMFree(context, device_ptr)); + } + } + + for (auto &buffer : buffers) { + if (buffer) { + EXPECT_SUCCESS(urMemRelease(buffer)); + } + } + + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferExpTest::TearDown()); + } + + static constexpr unsigned elements = 16; + static constexpr size_t allocation_size = elements * sizeof(uint32_t); + + std::array device_ptrs = {nullptr, nullptr}; + std::array buffers = {nullptr, nullptr}; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCommandBufferCommandsTest); + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMMemcpyExp) { + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + cmd_buf_handle, device_ptrs[0], device_ptrs[1], allocation_size, 0, + nullptr, 0, nullptr, nullptr, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMFillExp) { + uint32_t pattern = 42; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptrs[0], &pattern, sizeof(pattern), + allocation_size, 0, nullptr, 0, nullptr, nullptr, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferCopyExp) { + 
ASSERT_SUCCESS(urCommandBufferAppendMemBufferCopyExp( + cmd_buf_handle, buffers[0], buffers[1], 0, 0, allocation_size, 0, + nullptr, 0, nullptr, nullptr, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferCopyRectExp) { + ur_rect_offset_t origin{0, 0, 0}; + ur_rect_region_t region{4, 4, 1}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferCopyRectExp( + cmd_buf_handle, buffers[0], buffers[1], origin, origin, region, 4, 16, + 4, 16, 0, nullptr, 0, nullptr, nullptr, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferReadExp) { + std::array host_data{}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferReadExp( + cmd_buf_handle, buffers[0], 0, allocation_size, host_data.data(), 0, + nullptr, 0, nullptr, nullptr, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferReadRectExp) { + std::array host_data{}; + ur_rect_offset_t origin{0, 0, 0}; + ur_rect_region_t region{4, 4, 1}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferReadRectExp( + cmd_buf_handle, buffers[0], origin, origin, region, 4, 16, 4, 16, + host_data.data(), 0, nullptr, 0, nullptr, nullptr, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferWriteExp) { + std::array host_data{}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferWriteExp( + cmd_buf_handle, buffers[0], 0, allocation_size, host_data.data(), 0, + nullptr, 0, nullptr, nullptr, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, + urCommandBufferAppendMemBufferWriteRectExp) { + std::array host_data{}; + ur_rect_offset_t origin{0, 0, 0}; + ur_rect_region_t region{4, 4, 1}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferWriteRectExp( + cmd_buf_handle, buffers[0], origin, origin, region, 4, 16, 4, 16, + host_data.data(), 0, nullptr, 0, nullptr, nullptr, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferFillExp) { + uint32_t pattern = 42; + 
ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp( + cmd_buf_handle, buffers[0], &pattern, sizeof(pattern), 0, + allocation_size, 0, nullptr, 0, nullptr, nullptr, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMPrefetchExp) { + ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( + cmd_buf_handle, device_ptrs[0], allocation_size, 0, 0, nullptr, 0, + nullptr, nullptr, nullptr, nullptr)); +} + +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMAdviseExp) { + ASSERT_SUCCESS(urCommandBufferAppendUSMAdviseExp( + cmd_buf_handle, device_ptrs[0], allocation_size, 0, 0, nullptr, 0, + nullptr, nullptr, nullptr, nullptr)); +} + +struct urCommandBufferAppendKernelLaunchExpTest + : uur::command_buffer::urCommandBufferExpExecutionTest { + virtual void SetUp() override { + program_name = "saxpy_usm"; + UUR_RETURN_ON_FATAL_FAILURE(urCommandBufferExpExecutionTest::SetUp()); + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + } + + int32_t *ptrX = static_cast(shared_ptrs[1]); + int32_t *ptrY = static_cast(shared_ptrs[2]); + for (size_t i = 0; i < global_size; i++) { + ptrX[i] = i; + ptrY[i] = i * 2; + } + + // Index 0 is output + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 0, nullptr, shared_ptrs[0])); + // Index 1 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, 1, sizeof(A), nullptr, &A)); + // Index 2 is X + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 2, nullptr, shared_ptrs[1])); + // Index 3 is Y + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 3, nullptr, shared_ptrs[2])); + } + + virtual void TearDown() override { + for (auto &shared_ptr : shared_ptrs) { + if (shared_ptr) { + EXPECT_SUCCESS(urUSMFree(context, shared_ptr)); + } + } + + UUR_RETURN_ON_FATAL_FAILURE( + urCommandBufferExpExecutionTest::TearDown()); + } + + static constexpr size_t local_size = 4; + static constexpr size_t 
global_size = 32; + static constexpr size_t global_offset = 0; + static constexpr size_t n_dimensions = 1; + static constexpr size_t allocation_size = sizeof(uint32_t) * global_size; + static constexpr uint32_t A = 42; + std::array shared_ptrs = {nullptr, nullptr, nullptr}; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCommandBufferAppendKernelLaunchExpTest); +TEST_P(urCommandBufferAppendKernelLaunchExpTest, Basic) { + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + cmd_buf_handle, kernel, n_dimensions, &global_offset, &global_size, + &local_size, 0, nullptr, 0, nullptr, 0, nullptr, nullptr, nullptr, + nullptr)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + int32_t *ptrZ = static_cast(shared_ptrs[0]); + for (size_t i = 0; i < global_size; i++) { + uint32_t result = (A * i) + (i * 2); + ASSERT_EQ(result, ptrZ[i]); + } +} diff --git a/test/conformance/exp_command_buffer/event_sync.cpp b/test/conformance/exp_command_buffer/event_sync.cpp new file mode 100644 index 0000000000..a4356f8a29 --- /dev/null +++ b/test/conformance/exp_command_buffer/event_sync.cpp @@ -0,0 +1,507 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" + +// Tests non-kernel commands using ur events for synchronization work as expected +using CommandEventSyncTest = uur::command_buffer::urCommandEventSyncTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(CommandEventSyncTest); + +TEST_P(CommandEventSyncTest, USMMemcpyExp) { + // Get wait event from queue fill on ptr 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + &external_events[0])); + + // Command to fill ptr 1 + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptrs[1], &patternY, sizeof(patternY), + allocation_size, 0, nullptr, 0, nullptr, &sync_points[0], nullptr, + nullptr)); + + // Test command overwriting ptr 1 with ptr 0 command based on queue event + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + cmd_buf_handle, device_ptrs[1], device_ptrs[0], allocation_size, 1, + &sync_points[0], 1, &external_events[0], nullptr, &external_events[1], + nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + + // Queue read ptr 1 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[1], allocation_size, 1, + &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternX); + } +} + +TEST_P(CommandEventSyncTest, USMFillExp) { + // Get wait event from queue fill on ptr 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + &external_events[0])); + + // Test fill command overwriting ptr 0 waiting on queue 
event + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptrs[0], &patternY, sizeof(patternY), + allocation_size, 0, nullptr, 1, &external_events[0], nullptr, + &external_events[1], nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + + // Queue read ptr 0 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[0], allocation_size, 1, + &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternY); + } +} + +TEST_P(CommandEventSyncTest, MemBufferCopyExp) { + // Get wait event from queue fill on buffer 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternX, + sizeof(patternX), 0, allocation_size, + 0, nullptr, &external_events[0])); + + // Command to fill buffer 1 + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp( + cmd_buf_handle, buffers[1], &patternY, sizeof(patternY), 0, + allocation_size, 0, nullptr, 0, nullptr, &sync_points[0], nullptr, + nullptr)); + + // Test command overwriting buffer 1 with buffer 0 command based on queue event + ASSERT_SUCCESS(urCommandBufferAppendMemBufferCopyExp( + cmd_buf_handle, buffers[0], buffers[1], 0, 0, allocation_size, 1, + &sync_points[0], 1, &external_events[0], nullptr, &external_events[1], + nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + + // Queue read buffer 1 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[1], false, 0, allocation_size, 
host_enqueue_ptr.data(), + 1, &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternX); + } +} + +TEST_P(CommandEventSyncTest, MemBufferCopyRectExp) { + // Get wait event from queue fill on buffer 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternX, + sizeof(patternX), 0, allocation_size, + 0, nullptr, &external_events[0])); + + // Command to fill buffer 1 + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp( + cmd_buf_handle, buffers[1], &patternY, sizeof(patternY), 0, + allocation_size, 0, nullptr, 0, nullptr, &sync_points[0], nullptr, + nullptr)); + + // Test command overwriting buffer 1 with buffer 0 command based on queue event + ur_rect_offset_t src_origin{0, 0, 0}; + ur_rect_offset_t dst_origin{0, 0, 0}; + constexpr size_t rect_buffer_row_size = 16; + ur_rect_region_t region{rect_buffer_row_size, rect_buffer_row_size, 1}; + size_t src_row_pitch = rect_buffer_row_size; + size_t src_slice_pitch = allocation_size; + size_t dst_row_pitch = rect_buffer_row_size; + size_t dst_slice_pitch = allocation_size; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferCopyRectExp( + cmd_buf_handle, buffers[0], buffers[1], src_origin, dst_origin, region, + src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch, 1, + &sync_points[0], 1, &external_events[0], nullptr, &external_events[1], + nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + + // Queue read buffer 1 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[1], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < 
elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternX); + } +} + +TEST_P(CommandEventSyncTest, MemBufferReadExp) { + // Get wait event from queue fill on buffer 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternX, + sizeof(patternX), 0, allocation_size, + 0, nullptr, &external_events[0])); + + // Test command reading buffer 0 based on queue event + std::array host_command_ptr{}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferReadExp( + cmd_buf_handle, buffers[0], 0, allocation_size, host_command_ptr.data(), + 0, nullptr, 1, &external_events[0], nullptr, &external_events[1], + nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + + // Overwrite buffer 0 based on event returned from command-buffer command, + // then read back to verify ordering + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urEnqueueMemBufferFill( + queue, buffers[0], &patternY, sizeof(patternY), 0, allocation_size, 1, + &external_events[1], &external_events[2])); + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[2], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_command_ptr[i], patternX); + ASSERT_EQ(host_enqueue_ptr[i], patternY); + } +} + +TEST_P(CommandEventSyncTest, MemBufferReadRectExp) { + // Get wait event from queue fill on buffer 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternX, + sizeof(patternX), 0, allocation_size, + 0, nullptr, &external_events[0])); + + // Test command reading buffer 0 based on queue event + std::array host_command_ptr{}; + ur_rect_offset_t buffer_offset = {0, 0, 0}; + ur_rect_offset_t host_offset = {0, 0, 0}; + constexpr size_t rect_buffer_row_size = 16; + 
ur_rect_region_t region = {rect_buffer_row_size, rect_buffer_row_size, 1}; + size_t buffer_row_pitch = rect_buffer_row_size; + size_t buffer_slice_pitch = allocation_size; + size_t host_row_pitch = rect_buffer_row_size; + size_t host_slice_pitch = allocation_size; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferReadRectExp( + cmd_buf_handle, buffers[0], buffer_offset, host_offset, region, + buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, + host_command_ptr.data(), 0, nullptr, 1, &external_events[0], nullptr, + &external_events[1], nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + + // Overwrite buffer 0 based on event returned from command-buffer command, + // then read back to verify ordering + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urEnqueueMemBufferFill( + queue, buffers[0], &patternY, sizeof(patternY), 0, allocation_size, 1, + &external_events[1], &external_events[2])); + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[2], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_command_ptr[i], patternX); + ASSERT_EQ(host_enqueue_ptr[i], patternY); + } +} + +TEST_P(CommandEventSyncTest, MemBufferWriteExp) { + // Get wait event from queue fill on buffer 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternX, + sizeof(patternX), 0, allocation_size, + 0, nullptr, &external_events[0])); + + // Test command overwriting buffer 0 based on queue event + std::array host_command_ptr{}; + uint32_t patternY = 0xA; + std::fill(host_command_ptr.begin(), host_command_ptr.end(), patternY); + ASSERT_SUCCESS(urCommandBufferAppendMemBufferWriteExp( + cmd_buf_handle, buffers[0], 0, allocation_size, 
host_command_ptr.data(), + 0, nullptr, 1, &external_events[0], nullptr, &external_events[1], + nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + + // Read back buffer 0 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternY) << i; + } +} + +TEST_P(CommandEventSyncTest, MemBufferWriteRectExp) { + // Get wait event from queue fill on buffer 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternX, + sizeof(patternX), 0, allocation_size, + 0, nullptr, &external_events[0])); + + // Test command overwriting buffer 0 based on queue event + std::array host_command_ptr{}; + uint32_t patternY = 0xA; + std::fill(host_command_ptr.begin(), host_command_ptr.end(), patternY); + + ur_rect_offset_t buffer_offset = {0, 0, 0}; + ur_rect_offset_t host_offset = {0, 0, 0}; + constexpr size_t rect_buffer_row_size = 16; + ur_rect_region_t region = {rect_buffer_row_size, rect_buffer_row_size, 1}; + size_t buffer_row_pitch = rect_buffer_row_size; + size_t buffer_slice_pitch = allocation_size; + size_t host_row_pitch = rect_buffer_row_size; + size_t host_slice_pitch = allocation_size; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferWriteRectExp( + cmd_buf_handle, buffers[0], buffer_offset, host_offset, region, + buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, + host_command_ptr.data(), 0, nullptr, 1, &external_events[0], nullptr, + &external_events[1], nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, 
nullptr, nullptr)); + + // Read back buffer 0 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternY) << i; + } +} + +TEST_P(CommandEventSyncTest, MemBufferFillExp) { + // Get wait event from queue fill on buffer 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternX, + sizeof(patternX), 0, allocation_size, + 0, nullptr, &external_events[0])); + + // Test fill command overwriting buffer 0 based on queue event + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp( + cmd_buf_handle, buffers[0], &patternY, sizeof(patternY), 0, + allocation_size, 0, nullptr, 1, &external_events[0], nullptr, + &external_events[1], nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + + // Queue read buffer 0 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternY); + } +} + +TEST_P(CommandEventSyncTest, USMPrefetchExp) { + // Get wait event from queue fill on ptr 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + &external_events[0])); + + // Test prefetch command waiting on queue event + ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( + cmd_buf_handle, device_ptrs[1], 
allocation_size, 0 /* migration flags*/, + 0, nullptr, 1, &external_events[0], nullptr, &external_events[1], + nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + + // Queue read ptr 0 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[0], allocation_size, 1, + &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternX); + } +} + +TEST_P(CommandEventSyncTest, USMAdviseExp) { + // Get wait event from queue fill on ptr 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + &external_events[0])); + + // Test advise command waiting on queue event + ASSERT_SUCCESS(urCommandBufferAppendUSMAdviseExp( + cmd_buf_handle, device_ptrs[0], allocation_size, 0 /* advice flags*/, 0, + nullptr, 1, &external_events[0], nullptr, &external_events[1], + nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + + // Queue read ptr 0 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[0], allocation_size, 1, + &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternX); + } +} + +TEST_P(CommandEventSyncTest, MultipleEventCommands) { + // Command to fill ptr 0 + uint32_t patternA = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptrs[0], &patternA, sizeof(patternA), + allocation_size, 0, 
nullptr, 0, nullptr, nullptr, &external_events[0], + nullptr)); + + // Command to fill ptr 1 + uint32_t patternB = 0xB; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptrs[1], &patternB, sizeof(patternB), + allocation_size, 0, nullptr, 1, &external_events[0], nullptr, + &external_events[1], nullptr)); + + // Command to fill ptr 1 + uint32_t patternC = 0xC; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptrs[2], &patternC, sizeof(patternC), + allocation_size, 0, nullptr, 1, &external_events[1], nullptr, + &external_events[2], nullptr)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + + // Queue read ptr 1 based on event returned from command-buffer command + std::array host_enqueue_ptrA, host_enqueue_ptrB, + host_enqueue_ptrC; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptrA.data(), + device_ptrs[0], allocation_size, 1, + &external_events[0], nullptr)); + + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptrB.data(), + device_ptrs[1], allocation_size, 1, + &external_events[1], nullptr)); + + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptrC.data(), + device_ptrs[2], allocation_size, 1, + &external_events[2], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptrA[i], patternA); + ASSERT_EQ(host_enqueue_ptrB[i], patternB); + ASSERT_EQ(host_enqueue_ptrC[i], patternC); + } +} + +TEST_P(CommandEventSyncTest, MultipleEventCommandsBetweenCommandBuffers) { + // Command to fill ptr 0 + uint32_t patternA = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptrs[0], &patternA, sizeof(patternA), + allocation_size, 0, nullptr, 0, nullptr, nullptr, &external_events[0], + nullptr)); + + // Command to fill ptr 1 + uint32_t patternB = 0xB; + 
ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptrs[1], &patternB, sizeof(patternB), + allocation_size, 0, nullptr, 1, &external_events[0], nullptr, + &external_events[1], nullptr)); + + // Command to fill ptr 1 + uint32_t patternC = 0xC; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptrs[2], &patternC, sizeof(patternC), + allocation_size, 0, nullptr, 1, &external_events[1], nullptr, + &external_events[2], nullptr)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + // Queue read ptr 1 based on event returned from command-buffer command + std::array host_enqueue_ptrA, host_enqueue_ptrB, + host_enqueue_ptrC; + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + second_cmd_buf_handle, host_enqueue_ptrA.data(), device_ptrs[0], + allocation_size, 0, nullptr, 1, &external_events[0], nullptr, nullptr, + nullptr)); + + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + second_cmd_buf_handle, host_enqueue_ptrB.data(), device_ptrs[1], + allocation_size, 0, nullptr, 1, &external_events[1], nullptr, nullptr, + nullptr)); + + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + second_cmd_buf_handle, host_enqueue_ptrC.data(), device_ptrs[2], + allocation_size, 0, nullptr, 1, &external_events[2], nullptr, nullptr, + nullptr)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(second_cmd_buf_handle)); + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(second_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptrA[i], patternA); + ASSERT_EQ(host_enqueue_ptrB[i], patternB); + ASSERT_EQ(host_enqueue_ptrC[i], patternC); + } +} diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match deleted file mode 100644 index 
e69de29bb2..0000000000 diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero-v2.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero-v2.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero_v2.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero_v2.match new file mode 100644 index 0000000000..7e7ecf8d4e --- /dev/null +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero_v2.match @@ -0,0 +1,59 @@ +{{NONDETERMINISTIC}} +urCommandBufferReleaseExpTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferReleaseExpTest.InvalidNullHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferRetainExpTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferRetainExpTest.InvalidNullHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendUSMMemcpyExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendUSMFillExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendMemBufferCopyExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendMemBufferCopyRectExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ 
+urCommandBufferCommandsTest.urCommandBufferAppendMemBufferReadExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendMemBufferReadRectExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendMemBufferWriteExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendMemBufferWriteRectExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendMemBufferFillExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendUSMPrefetchExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendUSMAdviseExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferAppendKernelLaunchExpTest.Basic/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__1__patternSize__1 +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__256 +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__1024__patternSize__256 +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__4 +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__8 +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__16 +urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__32 
+urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__1__patternSize__1 +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__256 +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__1024__patternSize__256 +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__4 +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__8 +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__16 +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__32 +KernelCommandEventSyncTest.Basic/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +KernelCommandEventSyncTest.InterCommandBuffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +KernelCommandEventSyncTest.SignalWaitBeforeEnqueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncTest.USMMemcpyExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncTest.USMFillExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncTest.MemBufferCopyExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncTest.MemBufferCopyRectExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncTest.MemBufferReadExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncTest.MemBufferReadRectExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncTest.MemBufferWriteExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncTest.MemBufferWriteRectExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
+CommandEventSyncTest.MemBufferFillExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncTest.USMPrefetchExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncTest.USMAdviseExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncTest.MultipleEventCommands/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncTest.MultipleEventCommandsBetweenCommandBuffers/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncUpdateTest.USMMemcpyExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncUpdateTest.USMFillExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncUpdateTest.MemBufferCopyExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncUpdateTest.MemBufferCopyRectExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncUpdateTest.MemBufferReadExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncUpdateTest.MemBufferReadRectExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncUpdateTest.MemBufferWriteExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncUpdateTest.MemBufferWriteRectExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncUpdateTest.MemBufferFillExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncUpdateTest.USMPrefetchExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncUpdateTest.USMAdviseExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +CommandEventSyncUpdateTest.MultipleEventCommands/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match index 0a5a2b1317..2ccc267535 100644 --- 
a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match @@ -1,8 +1,19 @@ +{{NONDETERMINISTIC}} +{{OPT}}urCommandBufferReleaseCommandExpTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urCommandBufferReleaseCommandExpTest.ReleaseCmdBufBeforeHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urCommandBufferReleaseCommandExpTest.ReleaseCmdBufMultipleHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urCommandBufferReleaseCommandExpTest.InvalidNullHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urCommandBufferRetainCommandExpTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urCommandBufferRetainCommandExpTest.InvalidNullHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urCommandBufferAppendKernelLaunchExpTest.Basic/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}BufferFillCommandTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}BufferFillCommandTest.UpdateGlobalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}BufferFillCommandTest.SeparateUpdateCalls/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}BufferFillCommandTest.OverrideUpdate/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}BufferFillCommandTest.OverrideArgList/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}InvalidUpdateTest.NotFinalizedCommandBuffer/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}InvalidUpdateTest.NotUpdatableCommandBuffer/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}InvalidUpdateTest.InvalidDimensions/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}USMFillCommandTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}USMFillCommandTest.UpdateBeforeEnqueue/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}USMMultipleFillCommandTest.UpdateAllKernels/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} @@ -13,15 +24,15 @@ 
{{OPT}}NDRangeUpdateTest.Update3D/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}NDRangeUpdateTest.Update2D/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}NDRangeUpdateTest.Update1D/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}NDRangeUpdateTest.Invalid/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}urCommandBufferReleaseCommandExpTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}urCommandBufferReleaseCommandExpTest.ReleaseCmdBufBeforeHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}urCommandBufferReleaseCommandExpTest.ReleaseCmdBufMultipleHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}urCommandBufferReleaseCommandExpTest.InvalidNullHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}urCommandBufferRetainCommandExpTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}urCommandBufferRetainCommandExpTest.InvalidNullHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}InvalidUpdateTest.NotFinalizedCommandBuffer/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}InvalidUpdateTest.NotUpdatableCommandBuffer/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}InvalidUpdateTest.GlobalLocalSizeMistach/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}InvalidUpdateTest.ImplToUserDefinedLocalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}InvalidUpdateTest.UserToImplDefinedLocalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}NDRangeUpdateTest.ImplToUserDefinedLocalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}NDRangeUpdateTest.UserToImplDefinedLocalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}USMSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}USMMultiSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}USMMultiSaxpyKernelTest.UpdateWithoutBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}KernelCommandEventSyncTest.Basic/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} 
+{{OPT}}KernelCommandEventSyncTest.InterCommandBuffer/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}KernelCommandEventSyncTest.SignalWaitBeforeEnqueue/SYCL_NATIVE_CPU__{{.*}} +{{OPT}}KernelCommandEventSyncUpdateTest.Basic/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}KernelCommandEventSyncUpdateTest.TwoWaitEvents/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}KernelCommandEventSyncUpdateTest.InvalidWaitUpdate/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}KernelCommandEventSyncUpdateTest.InvalidSignalUpdate/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_opencl.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_opencl.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/exp_command_buffer/fill.cpp b/test/conformance/exp_command_buffer/fill.cpp new file mode 100644 index 0000000000..278cc4578e --- /dev/null +++ b/test/conformance/exp_command_buffer/fill.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" + +struct testParametersFill { + size_t size; + size_t pattern_size; +}; + +struct urCommandBufferFillCommandsTest + : uur::command_buffer::urCommandBufferExpTestWithParam { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferExpTestWithParam< + testParametersFill>::SetUp()); + + size = std::get<1>(GetParam()).size; + pattern_size = std::get<1>(GetParam()).pattern_size; + pattern = std::vector(pattern_size); + uur::generateMemFillPattern(pattern); + + // Allocate USM pointers + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, size, + &device_ptr)); + ASSERT_NE(device_ptr, nullptr); + + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size, + nullptr, &buffer)); + + ASSERT_NE(buffer, nullptr); + } + + void TearDown() override { + if (device_ptr) { + EXPECT_SUCCESS(urUSMFree(context, device_ptr)); + } + + if (buffer) { + EXPECT_SUCCESS(urMemRelease(buffer)); + } + + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferExpTestWithParam< + testParametersFill>::TearDown()); + } + + void verifyData(std::vector &output, size_t verify_size) { + size_t pattern_index = 0; + for (size_t i = 0; i < verify_size; ++i) { + ASSERT_EQ(output[i], pattern[pattern_index]) + << "Result mismatch at index: " << i; + + ++pattern_index; + if (pattern_index % pattern_size == 0) { + pattern_index = 0; + } + } + } + + static constexpr unsigned elements = 16; + static constexpr size_t allocation_size = elements * sizeof(uint32_t); + + std::vector pattern; + size_t size; + size_t pattern_size; + + ur_exp_command_buffer_sync_point_t sync_point; + void *device_ptr = nullptr; + ur_mem_handle_t buffer = nullptr; +}; + +static std::vector test_cases{ + /* Everything set to 1 */ + {1, 1}, + /* pattern_size == size */ + {256, 256}, + /* pattern_size < size */ + {1024, 256}, + /* pattern sizes corresponding 
to some common scalar and vector types */ + {256, 4}, + {256, 8}, + {256, 16}, + {256, 32}}; + +template +static std::string +printFillTestString(const testing::TestParamInfo &info) { + const auto device_handle = std::get<0>(info.param); + const auto platform_device_name = + uur::GetPlatformAndDeviceName(device_handle); + std::stringstream test_name; + test_name << platform_device_name << "__size__" + << std::get<1>(info.param).size << "__patternSize__" + << std::get<1>(info.param).pattern_size; + return test_name.str(); +} + +UUR_TEST_SUITE_P(urCommandBufferFillCommandsTest, testing::ValuesIn(test_cases), + printFillTestString); + +TEST_P(urCommandBufferFillCommandsTest, Buffer) { + ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp( + cmd_buf_handle, buffer, pattern.data(), pattern_size, 0, size, 0, + nullptr, 0, nullptr, &sync_point, nullptr, nullptr)); + + std::vector output(size, 1); + ASSERT_SUCCESS(urCommandBufferAppendMemBufferReadExp( + cmd_buf_handle, buffer, 0, size, output.data(), 1, &sync_point, 0, + nullptr, nullptr, nullptr, nullptr)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + verifyData(output, size); +} + +TEST_P(urCommandBufferFillCommandsTest, USM) { + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptr, pattern.data(), pattern_size, size, 0, + nullptr, 0, nullptr, &sync_point, nullptr, nullptr)); + + std::vector output(size, 1); + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + cmd_buf_handle, output.data(), device_ptr, size, 1, &sync_point, 0, + nullptr, nullptr, nullptr, nullptr)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + verifyData(output, size); +} diff --git 
a/test/conformance/exp_command_buffer/fixtures.h b/test/conformance/exp_command_buffer/fixtures.h index eeb0a5d5d8..42bee05b5a 100644 --- a/test/conformance/exp_command_buffer/fixtures.h +++ b/test/conformance/exp_command_buffer/fixtures.h @@ -11,34 +11,49 @@ namespace uur { namespace command_buffer { -struct urCommandBufferExpTest : uur::urContextTest { - void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(uur::urContextTest::SetUp()); +static void checkCommandBufferSupport(ur_device_handle_t device) { + size_t returned_size; + ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS, 0, + nullptr, &returned_size)); - size_t returned_size; - ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS, 0, - nullptr, &returned_size)); + std::unique_ptr returned_extensions(new char[returned_size]); - std::unique_ptr returned_extensions(new char[returned_size]); + ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS, + returned_size, returned_extensions.get(), + nullptr)); - ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS, - returned_size, returned_extensions.get(), - nullptr)); + std::string_view extensions_string(returned_extensions.get()); + bool command_buffer_support = + extensions_string.find(UR_COMMAND_BUFFER_EXTENSION_STRING_EXP) != + std::string::npos; - std::string_view extensions_string(returned_extensions.get()); - bool command_buffer_support = - extensions_string.find(UR_COMMAND_BUFFER_EXTENSION_STRING_EXP) != - std::string::npos; - - if (!command_buffer_support) { - GTEST_SKIP() << "EXP command-buffer feature is not supported."; - } + if (!command_buffer_support) { + GTEST_SKIP() << "EXP command-buffer feature is not supported."; + } +} + +static void checkCommandBufferUpdateSupport( + ur_device_handle_t device, + ur_device_command_buffer_update_capability_flags_t required_capabilities) { + ur_device_command_buffer_update_capability_flags_t update_capability_flags; + ASSERT_SUCCESS(urDeviceGetInfo( + device, 
UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP, + sizeof(update_capability_flags), &update_capability_flags, nullptr)); + + if (!update_capability_flags) { + GTEST_SKIP() << "Updating EXP command-buffers is not supported."; + } else if ((update_capability_flags & required_capabilities) != + required_capabilities) { + GTEST_SKIP() << "Some of the command-buffer update capabilities " + "required are not supported by the device."; + } +} - ASSERT_SUCCESS(urDeviceGetInfo( - device, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP, - sizeof(ur_bool_t), &updatable_command_buffer_support, nullptr)); +struct urCommandBufferExpTest : uur::urContextTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(uur::urContextTest::SetUp()); - // Create a command-buffer + UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, nullptr, &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); @@ -52,40 +67,34 @@ struct urCommandBufferExpTest : uur::urContextTest { } ur_exp_command_buffer_handle_t cmd_buf_handle = nullptr; - ur_bool_t updatable_command_buffer_support = false; }; -struct urCommandBufferExpExecutionTest : uur::urKernelExecutionTest { +template +struct urCommandBufferExpTestWithParam : urQueueTestWithParam { void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTest::SetUp()); - - ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, - sizeof(backend), &backend, nullptr)); - - size_t returned_size; - ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS, 0, - nullptr, &returned_size)); - - std::unique_ptr returned_extensions(new char[returned_size]); + UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTestWithParam::SetUp()); - ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS, - returned_size, returned_extensions.get(), - nullptr)); - - std::string_view extensions_string(returned_extensions.get()); - bool command_buffer_support = - 
extensions_string.find(UR_COMMAND_BUFFER_EXTENSION_STRING_EXP) != - std::string::npos; + UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(this->device)); + ASSERT_SUCCESS(urCommandBufferCreateExp(this->context, this->device, + nullptr, &cmd_buf_handle)); + ASSERT_NE(cmd_buf_handle, nullptr); + } - if (!command_buffer_support) { - GTEST_SKIP() << "EXP command-buffer feature is not supported."; + void TearDown() override { + if (cmd_buf_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); } + UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTestWithParam::TearDown()); + } - ASSERT_SUCCESS(urDeviceGetInfo( - device, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP, - sizeof(ur_bool_t), &updatable_command_buffer_support, nullptr)); + ur_exp_command_buffer_handle_t cmd_buf_handle = nullptr; +}; + +struct urCommandBufferExpExecutionTest : uur::urKernelExecutionTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTest::SetUp()); - // Create a command-buffer + UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, nullptr, &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); @@ -99,57 +108,78 @@ struct urCommandBufferExpExecutionTest : uur::urKernelExecutionTest { } ur_exp_command_buffer_handle_t cmd_buf_handle = nullptr; - ur_bool_t updatable_command_buffer_support = false; - ur_platform_backend_t backend{}; }; -struct urUpdatableCommandBufferExpExecutionTest - : urCommandBufferExpExecutionTest { +struct urUpdatableCommandBufferExpTest : uur::urQueueTest { void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urCommandBufferExpExecutionTest ::SetUp()); + UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTest::SetUp()); - if (!updatable_command_buffer_support) { - GTEST_SKIP() << "Updating EXP command-buffers is not supported."; - } + UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); + + auto required_capabilities = + 
UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET; + UUR_RETURN_ON_FATAL_FAILURE( + checkCommandBufferUpdateSupport(device, required_capabilities)); // Create a command-buffer with update enabled. ur_exp_command_buffer_desc_t desc{ - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, true}; + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, true, false, + false}; ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, &desc, &updatable_cmd_buf_handle)); ASSERT_NE(updatable_cmd_buf_handle, nullptr); - - // Currently there are synchronization issue with immediate submission when used for command buffers. - // So, create queue with batched submission for this test suite if the backend is Level Zero. - if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { - ur_queue_flags_t flags = UR_QUEUE_FLAG_SUBMISSION_BATCHED; - ur_queue_properties_t props = { - /*.stype =*/UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, - /*.pNext =*/nullptr, - /*.flags =*/flags, - }; - ASSERT_SUCCESS(urQueueCreate(context, device, &props, &queue)); - ASSERT_NE(queue, nullptr); - } else { - queue = urCommandBufferExpExecutionTest::queue; - } } void TearDown() override { if (updatable_cmd_buf_handle) { EXPECT_SUCCESS(urCommandBufferReleaseExp(updatable_cmd_buf_handle)); } - if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO && queue) { - ASSERT_SUCCESS(urQueueRelease(queue)); - } + UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTest::TearDown()); + } + + ur_exp_command_buffer_handle_t updatable_cmd_buf_handle = nullptr; + ur_platform_backend_t backend{}; +}; + +struct urUpdatableCommandBufferExpExecutionTest : uur::urKernelExecutionTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTest::SetUp()); + + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, 
+ sizeof(backend), &backend, nullptr)); + UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); + auto required_capabilities = + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET; UUR_RETURN_ON_FATAL_FAILURE( - urCommandBufferExpExecutionTest::TearDown()); + checkCommandBufferUpdateSupport(device, required_capabilities)); + + // Create a command-buffer with update enabled. + ur_exp_command_buffer_desc_t desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, true, false, + false}; + + ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, &desc, + &updatable_cmd_buf_handle)); + ASSERT_NE(updatable_cmd_buf_handle, nullptr); + } + + void TearDown() override { + if (updatable_cmd_buf_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseExp(updatable_cmd_buf_handle)); + } + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::TearDown()); } + ur_platform_backend_t backend{}; ur_exp_command_buffer_handle_t updatable_cmd_buf_handle = nullptr; - ur_queue_handle_t queue = nullptr; }; struct urCommandBufferCommandExpTest @@ -161,12 +191,14 @@ struct urCommandBufferCommandExpTest // Append 2 kernel commands to command-buffer and close command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, &command_handle)); + &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, + nullptr, nullptr, &command_handle)); ASSERT_NE(command_handle, nullptr); ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, &command_handle_2)); + &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, + nullptr, nullptr, 
&command_handle_2)); ASSERT_NE(command_handle_2, nullptr); ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); @@ -193,6 +225,211 @@ struct urCommandBufferCommandExpTest ur_exp_command_buffer_command_handle_t command_handle = nullptr; ur_exp_command_buffer_command_handle_t command_handle_2 = nullptr; }; + +struct TestKernel { + + TestKernel(std::string Name, ur_platform_handle_t Platform, + ur_context_handle_t Context, ur_device_handle_t Device) + : Name(std::move(Name)), Platform(Platform), Context(Context), + Device(Device) {} + + virtual ~TestKernel() = default; + + virtual void buildKernel() { + std::shared_ptr> ILBinary; + std::vector Metadatas{}; + + ur_platform_backend_t Backend; + ASSERT_SUCCESS(urPlatformGetInfo(Platform, UR_PLATFORM_INFO_BACKEND, + sizeof(Backend), &Backend, nullptr)); + + ASSERT_NO_FATAL_FAILURE( + uur::KernelsEnvironment::instance->LoadSource(Name, ILBinary)); + + const ur_program_properties_t Properties = { + UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES, nullptr, + static_cast(Metadatas.size()), + Metadatas.empty() ? 
nullptr : Metadatas.data()}; + ASSERT_SUCCESS(uur::KernelsEnvironment::instance->CreateProgram( + Platform, Context, Device, *ILBinary, &Properties, &Program)); + + auto KernelNames = + uur::KernelsEnvironment::instance->GetEntryPointNames(Name); + std::string KernelName = KernelNames[0]; + ASSERT_FALSE(KernelName.empty()); + + ASSERT_SUCCESS(urProgramBuild(Context, Program, nullptr)); + ASSERT_SUCCESS(urKernelCreate(Program, KernelName.data(), &Kernel)); + } + + virtual void setUpKernel() = 0; + + virtual void destroyKernel() { + ASSERT_SUCCESS(urKernelRelease(Kernel)); + ASSERT_SUCCESS(urProgramRelease(Program)); + }; + + virtual void validate() = 0; + + std::string Name; + ur_platform_handle_t Platform; + ur_context_handle_t Context; + ur_device_handle_t Device; + ur_program_handle_t Program; + ur_kernel_handle_t Kernel; +}; + +struct urCommandBufferMultipleKernelUpdateTest + : uur::command_buffer::urUpdatableCommandBufferExpTest { + virtual void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urUpdatableCommandBufferExpTest::SetUp()); + } + + virtual void TearDown() override { + for (auto &TestKernel : TestKernels) { + UUR_RETURN_ON_FATAL_FAILURE(TestKernel->destroyKernel()); + } + UUR_RETURN_ON_FATAL_FAILURE( + urUpdatableCommandBufferExpTest::TearDown()); + } + + void setUpKernels() { + for (auto &TestKernel : TestKernels) { + UUR_RETURN_ON_FATAL_FAILURE(TestKernel->setUpKernel()); + } + } + + std::vector> TestKernels{}; +}; + +struct urCommandEventSyncTest : urCommandBufferExpTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urCommandBufferExpTest::SetUp()); + + ur_bool_t event_support = false; + ASSERT_SUCCESS(urDeviceGetInfo( + device, UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP, + sizeof(ur_bool_t), &event_support, nullptr)); + if (!event_support) { + GTEST_SKIP() << "External event sync is not supported by device."; + } + + ur_queue_flags_t flags = UR_QUEUE_FLAG_SUBMISSION_BATCHED; + ur_queue_properties_t props = { + /*.stype 
=*/UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, + /*.pNext =*/nullptr, + /*.flags =*/flags, + }; + ASSERT_SUCCESS(urQueueCreate(context, device, &props, &queue)); + ASSERT_NE(queue, nullptr); + + for (auto &device_ptr : device_ptrs) { + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + allocation_size, &device_ptr)); + ASSERT_NE(device_ptr, nullptr); + } + + for (auto &buffer : buffers) { + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, + allocation_size, nullptr, + &buffer)); + ASSERT_NE(buffer, nullptr); + } + + // Create a command-buffer with update enabled. + ur_exp_command_buffer_desc_t desc{ + /*.stype=*/UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, + /*.pNext =*/nullptr, + /*.isUpdatable =*/false, + /*.isInOrder =*/false, + /*.enableProfiling =*/false, + }; + + ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, &desc, + &second_cmd_buf_handle)); + ASSERT_NE(second_cmd_buf_handle, nullptr); + } + + virtual void TearDown() override { + for (auto &device_ptr : device_ptrs) { + if (device_ptr) { + EXPECT_SUCCESS(urUSMFree(context, device_ptr)); + } + } + + for (auto &event : external_events) { + if (event) { + EXPECT_SUCCESS(urEventRelease(event)); + } + } + + for (auto &buffer : buffers) { + if (buffer) { + EXPECT_SUCCESS(urMemRelease(buffer)); + } + } + + if (queue) { + EXPECT_SUCCESS(urQueueRelease(queue)); + } + + if (second_cmd_buf_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseExp(second_cmd_buf_handle)); + } + + UUR_RETURN_ON_FATAL_FAILURE(urCommandBufferExpTest::TearDown()); + } + + std::array device_ptrs = {nullptr, nullptr, nullptr}; + std::array buffers = {nullptr, nullptr}; + std::array external_events = { + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; + std::array sync_points = {0, 0}; + ur_queue_handle_t queue = nullptr; + ur_exp_command_buffer_handle_t second_cmd_buf_handle = nullptr; + static constexpr size_t elements = 64; + static constexpr 
size_t allocation_size = sizeof(uint32_t) * elements; +}; + +struct urCommandEventSyncUpdateTest : urCommandEventSyncTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urCommandEventSyncTest::SetUp()); + + auto required_capabilities = + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS; + UUR_RETURN_ON_FATAL_FAILURE( + checkCommandBufferUpdateSupport(device, required_capabilities)); + + // Create a command-buffer with update enabled. + ur_exp_command_buffer_desc_t desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, true, false, + false}; + + ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, &desc, + &updatable_cmd_buf_handle)); + ASSERT_NE(updatable_cmd_buf_handle, nullptr); + } + + virtual void TearDown() override { + for (auto command_handle : command_handles) { + if (command_handle) { + EXPECT_SUCCESS( + urCommandBufferReleaseCommandExp(command_handle)); + } + } + + if (updatable_cmd_buf_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseExp(updatable_cmd_buf_handle)); + } + + UUR_RETURN_ON_FATAL_FAILURE(urCommandEventSyncTest::TearDown()); + } + + ur_exp_command_buffer_handle_t updatable_cmd_buf_handle = nullptr; + std::array command_handles = { + nullptr, nullptr, nullptr}; +}; } // namespace command_buffer } // namespace uur diff --git a/test/conformance/exp_command_buffer/kernel_event_sync.cpp b/test/conformance/exp_command_buffer/kernel_event_sync.cpp new file mode 100644 index 0000000000..a7c583b938 --- /dev/null +++ b/test/conformance/exp_command_buffer/kernel_event_sync.cpp @@ -0,0 +1,223 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" +#include + +// Tests kernel commands using ur events for command level synchronization work +// as expected. 
+struct KernelCommandEventSyncTest + : uur::command_buffer::urCommandBufferExpExecutionTest { + + void SetUp() override { + program_name = "saxpy_usm"; + UUR_RETURN_ON_FATAL_FAILURE(urCommandBufferExpExecutionTest::SetUp()); + + ur_bool_t event_support = false; + ASSERT_SUCCESS(urDeviceGetInfo( + device, UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP, + sizeof(ur_bool_t), &event_support, nullptr)); + if (!event_support) { + GTEST_SKIP() << "External event sync is not supported by device."; + } + + for (auto &device_ptr : device_ptrs) { + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + allocation_size, &device_ptr)); + ASSERT_NE(device_ptr, nullptr); + } + + // Index 0 is output + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 0, nullptr, device_ptrs[2])); + // Index 1 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, 1, sizeof(A), nullptr, &A)); + // Index 2 is X + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 2, nullptr, device_ptrs[0])); + // Index 3 is Y + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 3, nullptr, device_ptrs[1])); + + // Create second command-buffer + ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, nullptr, + &second_cmd_buf_handle)); + ASSERT_NE(second_cmd_buf_handle, nullptr); + } + + virtual void TearDown() override { + for (auto &device_ptr : device_ptrs) { + if (device_ptr) { + EXPECT_SUCCESS(urUSMFree(context, device_ptr)); + } + } + + for (auto &event : external_events) { + if (event) { + EXPECT_SUCCESS(urEventRelease(event)); + } + } + + if (second_cmd_buf_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseExp(second_cmd_buf_handle)); + } + + UUR_RETURN_ON_FATAL_FAILURE( + urCommandBufferExpExecutionTest::TearDown()); + } + + // First two device pointers are inputs to be tested, last is an output + // from saxpy kernel. 
+ std::array device_ptrs = {nullptr, nullptr, nullptr}; + std::array external_events = {nullptr, nullptr}; + std::array sync_points = {0, 0}; + ur_exp_command_buffer_handle_t second_cmd_buf_handle = nullptr; + static constexpr size_t elements = 64; + static constexpr size_t global_offset = 0; + static constexpr size_t allocation_size = sizeof(uint32_t) * elements; + static constexpr size_t A = 2; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(KernelCommandEventSyncTest); + +// Tests using a regular enqueue event as a dependency of a command-buffer +// command, and having the signal event of that command-buffer command being +// a dependency of another enqueue command. +TEST_P(KernelCommandEventSyncTest, Basic) { + // Initialize data X with queue submission + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + &external_events[0])); + + // Initialize data Y with command-buffer command + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptrs[1], &patternY, sizeof(patternY), + allocation_size, 0, nullptr, 0, nullptr, &sync_points[0], nullptr, + nullptr)); + + // Kernel command for SAXPY waiting on command and signal event + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + cmd_buf_handle, kernel, 1, &global_offset, &elements, nullptr, 0, + nullptr, 1, &sync_points[0], 1, &external_events[0], &sync_points[1], + &external_events[1], nullptr)); + + // command-buffer command that reads output to host + std::array host_command_ptr{}; + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + cmd_buf_handle, host_command_ptr.data(), device_ptrs[2], + allocation_size, 1, &sync_points[1], 0, nullptr, nullptr, nullptr, + nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + + // Queue command that reads output to host + 
std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[2], allocation_size, 1, + &external_events[1], nullptr)); + + ASSERT_SUCCESS(urQueueFinish(queue)); + + for (size_t i = 0; i < elements; i++) { + auto ref = (patternX * A) + patternY; + ASSERT_EQ(host_command_ptr[i], ref); + ASSERT_EQ(host_enqueue_ptr[i], ref); + } +} + +// Tests using events to synchronize between command-buffers: +TEST_P(KernelCommandEventSyncTest, InterCommandBuffer) { + // Initialize data X with command-buffer A command + uint32_t patternX = 42; + std::array dataX{}; + std::fill(dataX.begin(), dataX.end(), patternX); + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + cmd_buf_handle, device_ptrs[0], dataX.data(), allocation_size, 0, + nullptr, 0, nullptr, &sync_points[0], nullptr, nullptr)); + + // Initialize data Y with command-buffer A command + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptrs[1], &patternY, sizeof(patternY), + allocation_size, 1, &sync_points[0], 0, nullptr, &sync_points[1], + &external_events[0], nullptr)); + + // Run SAXPY kernel with command-buffer B command, waiting on an event. 
+ ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + second_cmd_buf_handle, kernel, 1, &global_offset, &elements, nullptr, 0, + nullptr, 0, nullptr, 1, &external_events[0], &sync_points[1], nullptr, + nullptr)); + + // Command-buffer A command that reads output to host, waiting on an event + std::array host_command_ptr{}; + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + second_cmd_buf_handle, host_command_ptr.data(), device_ptrs[2], + allocation_size, 1, &sync_points[1], 0, nullptr, nullptr, nullptr, + nullptr)); + + // Finalize command-buffers + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(second_cmd_buf_handle)); + + // Submit command-buffers + ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(second_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Verify execution + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + auto ref = (patternX * A) + patternY; + ASSERT_EQ(host_command_ptr[i], ref) << i; + } + + // Use new data for patternX + patternX = 666; + std::fill(dataX.begin(), dataX.end(), patternX); + + // Submit command-buffers again to check that dependencies still enforced. 
+ ASSERT_SUCCESS( + urCommandBufferEnqueueExp(cmd_buf_handle, queue, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(second_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Verify second execution + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + auto ref = (patternX * A) + patternY; + ASSERT_EQ(host_command_ptr[i], ref) << i; + } +} + +// Tests behavior of waiting on signal event before command-buffer has executed +TEST_P(KernelCommandEventSyncTest, SignalWaitBeforeEnqueue) { + // Initialize data X with queue submission + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + &external_events[0])); + + // Initialize data Y with command-buffer command + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + cmd_buf_handle, device_ptrs[1], &patternY, sizeof(patternY), + allocation_size, 0, nullptr, 0, nullptr, &sync_points[0], nullptr, + nullptr)); + + // Kernel command for SAXPY waiting on command and signal event + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + cmd_buf_handle, kernel, 1, &global_offset, &elements, nullptr, 0, + nullptr, 1, &sync_points[0], 1, &external_events[0], &sync_points[1], + &external_events[1], nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + // Event will be considered complete before first execution + ASSERT_SUCCESS(urEventWait(1, &external_events[1])); +} diff --git a/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp b/test/conformance/exp_command_buffer/update/buffer_fill_kernel_update.cpp similarity index 91% rename from test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp rename to test/conformance/exp_command_buffer/update/buffer_fill_kernel_update.cpp index 78e1ffd009..3e13a895ff 100644 --- a/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp +++ 
b/test/conformance/exp_command_buffer/update/buffer_fill_kernel_update.cpp @@ -3,7 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include "../fixtures.h" // Test that updating a command-buffer with a single kernel command // taking USM arguments works correctly. @@ -49,7 +49,8 @@ struct BufferFillCommandTest // Append kernel command to command-buffer and close command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, &command_handle)); + &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, + nullptr, nullptr, &command_handle)); ASSERT_NE(command_handle, nullptr); ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); @@ -72,7 +73,7 @@ struct BufferFillCommandTest static constexpr size_t local_size = 4; static constexpr size_t global_size = 32; static constexpr size_t global_offset = 0; - static constexpr size_t n_dimensions = 1; + static constexpr uint32_t n_dimensions = 1; static constexpr size_t buffer_size = sizeof(val) * global_size; ur_mem_handle_t buffer = nullptr; ur_mem_handle_t new_buffer = nullptr; @@ -123,10 +124,11 @@ TEST_P(BufferFillCommandTest, UpdateParameters) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 1, // numNewMemObjArgs 0, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim &new_output_desc, // pNewMemObjArgList nullptr, // pNewPointerArgList &new_input_desc, // pNewValueArgList @@ -175,10 +177,11 @@ TEST_P(BufferFillCommandTest, UpdateGlobalSize) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 1, // numNewMemObjArgs 0, // 
numNewPointerArgs 0, // numNewValueArgs - 1, // newWorkDim + n_dimensions, // newWorkDim &new_output_desc, // pNewMemObjArgList nullptr, // pNewPointerArgList nullptr, // pNewValueArgList @@ -225,10 +228,11 @@ TEST_P(BufferFillCommandTest, SeparateUpdateCalls) { ur_exp_command_buffer_update_kernel_launch_desc_t output_update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 1, // numNewMemObjArgs 0, // numNewPointerArgs 0, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim &new_output_desc, // pNewMemObjArgList nullptr, // pNewPointerArgList nullptr, // pNewValueArgList @@ -253,10 +257,11 @@ TEST_P(BufferFillCommandTest, SeparateUpdateCalls) { ur_exp_command_buffer_update_kernel_launch_desc_t input_update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList &new_input_desc, // pNewValueArgList @@ -271,16 +276,17 @@ TEST_P(BufferFillCommandTest, SeparateUpdateCalls) { ur_exp_command_buffer_update_kernel_launch_desc_t global_size_update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - 0, // numNewMemObjArgs - 0, // numNewPointerArgs - 0, // numNewValueArgs - static_cast(n_dimensions), // newWorkDim - nullptr, // pNewMemObjArgList - nullptr, // pNewPointerArgList - nullptr, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - &new_global_size, // pNewGlobalWorkSize - &new_local_size, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + 0, // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + nullptr, // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + 
&new_global_size, // pNewGlobalWorkSize + &new_local_size, // pNewLocalWorkSize }; ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( @@ -315,10 +321,11 @@ TEST_P(BufferFillCommandTest, OverrideUpdate) { ur_exp_command_buffer_update_kernel_launch_desc_t first_update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList &first_input_desc, // pNewValueArgList @@ -342,10 +349,11 @@ TEST_P(BufferFillCommandTest, OverrideUpdate) { ur_exp_command_buffer_update_kernel_launch_desc_t second_update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList &second_input_desc, // pNewValueArgList @@ -398,16 +406,17 @@ TEST_P(BufferFillCommandTest, OverrideArgList) { ur_exp_command_buffer_update_kernel_launch_desc_t second_update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - 0, // numNewMemObjArgs - 0, // numNewPointerArgs - 2, // numNewValueArgs - 0, // newWorkDim - nullptr, // pNewMemObjArgList - nullptr, // pNewPointerArgList - input_descs, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - nullptr, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + 2, // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + input_descs, // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize }; 
ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp(command_handle, diff --git a/test/conformance/exp_command_buffer/buffer_saxpy_kernel_update.cpp b/test/conformance/exp_command_buffer/update/buffer_saxpy_kernel_update.cpp similarity index 96% rename from test/conformance/exp_command_buffer/buffer_saxpy_kernel_update.cpp rename to test/conformance/exp_command_buffer/update/buffer_saxpy_kernel_update.cpp index 55e6773cb7..858b6b5680 100644 --- a/test/conformance/exp_command_buffer/buffer_saxpy_kernel_update.cpp +++ b/test/conformance/exp_command_buffer/update/buffer_saxpy_kernel_update.cpp @@ -3,7 +3,8 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include "../fixtures.h" +#include // Test that updating a command-buffer with a single kernel command // taking buffer & scalar arguments works correctly. @@ -83,7 +84,8 @@ struct BufferSaxpyKernelTest // Append kernel command to command-buffer and close command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, &command_handle)); + &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, + nullptr, nullptr, &command_handle)); ASSERT_NE(command_handle, nullptr); ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); @@ -129,7 +131,7 @@ struct BufferSaxpyKernelTest static constexpr size_t local_size = 4; static constexpr size_t global_size = 32; static constexpr size_t global_offset = 0; - static constexpr size_t n_dimensions = 1; + static constexpr uint32_t n_dimensions = 1; static constexpr uint32_t A = 42; std::array buffers = {nullptr, nullptr, nullptr, nullptr}; @@ -183,10 +185,11 @@ TEST_P(BufferSaxpyKernelTest, UpdateParameters) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 2, // 
numNewMemObjArgs 0, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim new_input_descs, // pNewMemObjArgList nullptr, // pNewPointerArgList &new_A_desc, // pNewValueArgList diff --git a/test/conformance/exp_command_buffer/update/event_sync.cpp b/test/conformance/exp_command_buffer/update/event_sync.cpp new file mode 100644 index 0000000000..13e1bed968 --- /dev/null +++ b/test/conformance/exp_command_buffer/update/event_sync.cpp @@ -0,0 +1,817 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "../fixtures.h" + +// Tests non-kernel commands using ur events for synchronization can be +// updated +using CommandEventSyncUpdateTest = + uur::command_buffer::urCommandEventSyncUpdateTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(CommandEventSyncUpdateTest); + +TEST_P(CommandEventSyncUpdateTest, USMMemcpyExp) { + // Get wait event from queue fill on ptr 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + &external_events[0])); + + // Command to fill ptr 1 + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + updatable_cmd_buf_handle, device_ptrs[1], &patternY, sizeof(patternY), + allocation_size, 0, nullptr, 0, nullptr, &sync_points[0], nullptr, + nullptr)); + + // Test command overwriting ptr 1 with ptr 0 command based on queue event + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + updatable_cmd_buf_handle, device_ptrs[1], device_ptrs[0], + allocation_size, 1, &sync_points[0], 1, &external_events[0], nullptr, + &external_events[1], &command_handles[0])); + ASSERT_NE(nullptr, command_handles[0]); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, 
queue, 0, + nullptr, nullptr)); + + // Queue read ptr 1 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[1], allocation_size, 1, + &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternX); + } + + uint32_t patternZ = 666; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternZ), + &patternZ, allocation_size, 0, nullptr, + &external_events[2])); + + // Update command command-wait event to wait on fill of new value + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1, + &external_events[2])); + + // Get a new signal event for command-buffer + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0], + &external_events[3])); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[1], allocation_size, 1, + &external_events[3], nullptr)); + + // Verify update + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternZ); + } +} + +TEST_P(CommandEventSyncUpdateTest, USMFillExp) { + // Get wait event from queue fill on ptr 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + &external_events[0])); + + // Test fill command overwriting ptr 0 waiting on queue event + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + updatable_cmd_buf_handle, device_ptrs[0], &patternY, sizeof(patternY), + allocation_size, 0, nullptr, 1, &external_events[0], nullptr, + &external_events[1], &command_handles[0])); + ASSERT_NE(nullptr, command_handles[0]); + 
ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Queue read ptr 0 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[0], allocation_size, 1, + &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternY); + } + + uint32_t patternZ = 666; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternZ), + &patternZ, allocation_size, 0, nullptr, + &external_events[2])); + + // Update command command-wait event to wait on fill of new value + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1, + &external_events[2])); + + // Get a new signal event for command-buffer + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0], + &external_events[3])); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[0], allocation_size, 1, + &external_events[3], nullptr)); + + // Verify update + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternY); + } +} + +TEST_P(CommandEventSyncUpdateTest, MemBufferCopyExp) { + // Get wait event from queue fill on buffer 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternX, + sizeof(patternX), 0, allocation_size, + 0, nullptr, &external_events[0])); + + // Command to fill buffer 1 + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp( + updatable_cmd_buf_handle, buffers[1], &patternY, sizeof(patternY), 0, + allocation_size, 0, nullptr, 0, nullptr, &sync_points[0], nullptr, + 
nullptr)); + + // Test command overwriting buffer 1 with buffer 0 command based on queue event + ASSERT_SUCCESS(urCommandBufferAppendMemBufferCopyExp( + updatable_cmd_buf_handle, buffers[0], buffers[1], 0, 0, allocation_size, + 1, &sync_points[0], 1, &external_events[0], nullptr, + &external_events[1], &command_handles[0])); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Queue read buffer 1 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[1], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternX); + } + + uint32_t patternZ = 666; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternZ, + sizeof(patternZ), 0, allocation_size, + 0, nullptr, &external_events[2])); + + // Update command command-wait event to wait on fill of new value + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1, + &external_events[2])); + + // Get a new signal event for command-buffer + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0], + &external_events[3])); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[1], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[3], nullptr)); + + // Verify update + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternZ); + } +} + +TEST_P(CommandEventSyncUpdateTest, MemBufferCopyRectExp) { + // Get wait event from queue fill on buffer 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, 
buffers[0], &patternX, + sizeof(patternX), 0, allocation_size, + 0, nullptr, &external_events[0])); + + // Command to fill buffer 1 + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp( + updatable_cmd_buf_handle, buffers[1], &patternY, sizeof(patternY), 0, + allocation_size, 0, nullptr, 0, nullptr, &sync_points[0], nullptr, + nullptr)); + + // Test command overwriting buffer 1 with buffer 0 command based on queue event + ur_rect_offset_t src_origin{0, 0, 0}; + ur_rect_offset_t dst_origin{0, 0, 0}; + constexpr size_t rect_buffer_row_size = 16; + ur_rect_region_t region{rect_buffer_row_size, rect_buffer_row_size, 1}; + size_t src_row_pitch = rect_buffer_row_size; + size_t src_slice_pitch = allocation_size; + size_t dst_row_pitch = rect_buffer_row_size; + size_t dst_slice_pitch = allocation_size; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferCopyRectExp( + updatable_cmd_buf_handle, buffers[0], buffers[1], src_origin, + dst_origin, region, src_row_pitch, src_slice_pitch, dst_row_pitch, + dst_slice_pitch, 1, &sync_points[0], 1, &external_events[0], nullptr, + &external_events[1], &command_handles[0])); + ASSERT_NE(nullptr, command_handles[0]); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Queue read buffer 1 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[1], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternX); + } + + uint32_t patternZ = 666; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternZ, + sizeof(patternZ), 0, allocation_size, + 0, nullptr, &external_events[2])); + + // Update command command-wait event to wait on 
fill of new value + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1, + &external_events[2])); + + // Get a new signal event for command-buffer + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0], + &external_events[3])); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[1], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[3], nullptr)); + + // Verify update + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternZ); + } +} + +TEST_P(CommandEventSyncUpdateTest, MemBufferReadExp) { + // Get wait event from queue fill on buffer 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternX, + sizeof(patternX), 0, allocation_size, + 0, nullptr, &external_events[0])); + + // Test command reading buffer 0 based on queue event + std::array host_command_ptr{}; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferReadExp( + updatable_cmd_buf_handle, buffers[0], 0, allocation_size, + host_command_ptr.data(), 0, nullptr, 1, &external_events[0], nullptr, + &external_events[1], &command_handles[0])); + ASSERT_NE(nullptr, command_handles[0]); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Overwrite buffer 0 based on event returned from command-buffer command, + // then read back to verify ordering + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urEnqueueMemBufferFill( + queue, buffers[0], &patternY, sizeof(patternY), 0, allocation_size, 1, + &external_events[1], &external_events[2])); + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[2], nullptr)); + + // Verify + 
ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_command_ptr[i], patternX); + ASSERT_EQ(host_enqueue_ptr[i], patternY); + } + + uint32_t patternZ = 666; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternZ, + sizeof(patternZ), 0, allocation_size, + 0, nullptr, &external_events[3])); + + // Update command command-wait event to wait on fill of new value + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1, + &external_events[3])); + + // Get a new signal event for command-buffer + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0], + &external_events[4])); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + uint32_t patternA = 0xF; + ASSERT_SUCCESS(urEnqueueMemBufferFill( + queue, buffers[0], &patternA, sizeof(patternA), 0, allocation_size, 1, + &external_events[4], &external_events[5])); + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[5], nullptr)); + + // Verify update + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_command_ptr[i], patternZ); + ASSERT_EQ(host_enqueue_ptr[i], patternA); + } +} + +TEST_P(CommandEventSyncUpdateTest, MemBufferReadRectExp) { + // Get wait event from queue fill on buffer 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternX, + sizeof(patternX), 0, allocation_size, + 0, nullptr, &external_events[0])); + + // Test command reading buffer 0 based on queue event + std::array host_command_ptr{}; + ur_rect_offset_t buffer_offset = {0, 0, 0}; + ur_rect_offset_t host_offset = {0, 0, 0}; + constexpr size_t rect_buffer_row_size = 16; + ur_rect_region_t region = {rect_buffer_row_size, rect_buffer_row_size, 1}; + size_t buffer_row_pitch = rect_buffer_row_size; + size_t buffer_slice_pitch = allocation_size; + size_t 
host_row_pitch = rect_buffer_row_size; + size_t host_slice_pitch = allocation_size; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferReadRectExp( + updatable_cmd_buf_handle, buffers[0], buffer_offset, host_offset, + region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch, + host_slice_pitch, host_command_ptr.data(), 0, nullptr, 1, + &external_events[0], nullptr, &external_events[1], + &command_handles[0])); + ASSERT_NE(nullptr, command_handles[0]); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Overwrite buffer 0 based on event returned from command-buffer command, + // then read back to verify ordering + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urEnqueueMemBufferFill( + queue, buffers[0], &patternY, sizeof(patternY), 0, allocation_size, 1, + &external_events[1], &external_events[2])); + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[2], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_command_ptr[i], patternX); + ASSERT_EQ(host_enqueue_ptr[i], patternY); + } + + uint32_t patternZ = 666; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternZ, + sizeof(patternZ), 0, allocation_size, + 0, nullptr, &external_events[3])); + + // Update command command-wait event to wait on fill of new value + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1, + &external_events[3])); + + // Get a new signal event for command-buffer + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0], + &external_events[4])); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + uint32_t patternA = 0xF; + ASSERT_SUCCESS(urEnqueueMemBufferFill( + queue, buffers[0], 
&patternA, sizeof(patternA), 0, allocation_size, 1, + &external_events[4], &external_events[5])); + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[5], nullptr)); + + // Verify update + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_command_ptr[i], patternZ); + ASSERT_EQ(host_enqueue_ptr[i], patternA); + } +} + +TEST_P(CommandEventSyncUpdateTest, MemBufferWriteExp) { + // Get wait event from queue fill on buffer 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternX, + sizeof(patternX), 0, allocation_size, + 0, nullptr, &external_events[0])); + + // Test command overwriting buffer 0 based on queue event + std::array host_command_ptr{}; + uint32_t patternY = 0xA; + std::fill(host_command_ptr.begin(), host_command_ptr.end(), patternY); + ASSERT_SUCCESS(urCommandBufferAppendMemBufferWriteExp( + updatable_cmd_buf_handle, buffers[0], 0, allocation_size, + host_command_ptr.data(), 0, nullptr, 1, &external_events[0], nullptr, + &external_events[1], &command_handles[0])); + ASSERT_NE(nullptr, command_handles[0]); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Read back buffer 0 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternY) << i; + } + + uint32_t patternZ = 666; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternZ, + sizeof(patternZ), 0, allocation_size, + 0, nullptr, &external_events[2])); + + // Update command command-wait event 
to wait on fill of new value + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1, + &external_events[2])); + + // Get a new signal event for command-buffer + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0], + &external_events[3])); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[3], nullptr)); + + // Verify update + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternY); + } +} + +TEST_P(CommandEventSyncUpdateTest, MemBufferWriteRectExp) { + // Get wait event from queue fill on buffer 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternX, + sizeof(patternX), 0, allocation_size, + 0, nullptr, &external_events[0])); + + // Test command overwriting buffer 0 based on queue event + std::array host_command_ptr{}; + uint32_t patternY = 0xA; + std::fill(host_command_ptr.begin(), host_command_ptr.end(), patternY); + + ur_rect_offset_t buffer_offset = {0, 0, 0}; + ur_rect_offset_t host_offset = {0, 0, 0}; + constexpr size_t rect_buffer_row_size = 16; + ur_rect_region_t region = {rect_buffer_row_size, rect_buffer_row_size, 1}; + size_t buffer_row_pitch = rect_buffer_row_size; + size_t buffer_slice_pitch = allocation_size; + size_t host_row_pitch = rect_buffer_row_size; + size_t host_slice_pitch = allocation_size; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferWriteRectExp( + updatable_cmd_buf_handle, buffers[0], buffer_offset, host_offset, + region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch, + host_slice_pitch, host_command_ptr.data(), 0, nullptr, 1, + &external_events[0], nullptr, &external_events[1], + &command_handles[0])); + ASSERT_NE(nullptr, command_handles[0]); + 
ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Read back buffer 0 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternY) << i; + } + + uint32_t patternZ = 666; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternZ, + sizeof(patternZ), 0, allocation_size, + 0, nullptr, &external_events[2])); + + // Update command command-wait event to wait on fill of new value + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1, + &external_events[2])); + + // Get a new signal event for command-buffer + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0], + &external_events[3])); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[3], nullptr)); + + // Verify update + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternY); + } +} + +TEST_P(CommandEventSyncUpdateTest, MemBufferFillExp) { + // Get wait event from queue fill on buffer 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternX, + sizeof(patternX), 0, allocation_size, + 0, nullptr, &external_events[0])); + + // Test fill command overwriting buffer 0 based on queue event + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp( + updatable_cmd_buf_handle, buffers[0], &patternY, sizeof(patternY), 0, + 
allocation_size, 0, nullptr, 1, &external_events[0], nullptr, + &external_events[1], &command_handles[0])); + ASSERT_NE(nullptr, command_handles[0]); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Queue read buffer 0 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternY); + } + + uint32_t patternZ = 666; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffers[0], &patternZ, + sizeof(patternZ), 0, allocation_size, + 0, nullptr, &external_events[2])); + + // Update command command-wait event to wait on fill of new value + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1, + &external_events[2])); + + // Get a new signal event for command-buffer + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0], + &external_events[3])); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueMemBufferRead( + queue, buffers[0], false, 0, allocation_size, host_enqueue_ptr.data(), + 1, &external_events[3], nullptr)); + + // Verify update + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternY); + } +} + +TEST_P(CommandEventSyncUpdateTest, USMPrefetchExp) { + // Get wait event from queue fill on ptr 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + &external_events[0])); + + // Test prefetch command waiting on queue event + 
ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( + updatable_cmd_buf_handle, device_ptrs[1], allocation_size, + 0 /* migration flags*/, 0, nullptr, 1, &external_events[0], nullptr, + &external_events[1], &command_handles[0])); + ASSERT_NE(nullptr, command_handles[0]); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Queue read ptr 0 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[0], allocation_size, 1, + &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternX); + } + + uint32_t patternY = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternY), + &patternY, allocation_size, 0, nullptr, + &external_events[2])); + + // Update command command-wait event to wait on fill of new value + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1, + &external_events[2])); + + // Get a new signal event for command-buffer + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0], + &external_events[3])); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[0], allocation_size, 1, + &external_events[3], nullptr)); + + // Verify update + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternY); + } +} + +TEST_P(CommandEventSyncUpdateTest, USMAdviseExp) { + // Get wait event from queue fill on ptr 0 + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + 
&external_events[0])); + + // Test advise command waiting on queue event + ASSERT_SUCCESS(urCommandBufferAppendUSMAdviseExp( + updatable_cmd_buf_handle, device_ptrs[0], allocation_size, + 0 /* advice flags*/, 0, nullptr, 1, &external_events[0], nullptr, + &external_events[1], &command_handles[0])); + ASSERT_NE(nullptr, command_handles[0]); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Queue read ptr 0 based on event returned from command-buffer command + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[0], allocation_size, 1, + &external_events[1], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternX); + } + + uint32_t patternY = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternY), + &patternY, allocation_size, 0, nullptr, + &external_events[2])); + + // Update command command-wait event to wait on fill of new value + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1, + &external_events[2])); + + // Get a new signal event for command-buffer + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0], + &external_events[3])); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[0], allocation_size, 1, + &external_events[3], nullptr)); + + // Verify update + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptr[i], patternY); + } +} + +TEST_P(CommandEventSyncUpdateTest, MultipleEventCommands) { + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 
0, nullptr, + &external_events[0])); + uint32_t patternY = 43; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[1], sizeof(patternY), + &patternY, allocation_size, 0, nullptr, + &external_events[1])); + + uint32_t patternZ = 44; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[2], sizeof(patternZ), + &patternZ, allocation_size, 0, nullptr, + &external_events[2])); + + // Command to fill ptr 0 + uint32_t patternA = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + updatable_cmd_buf_handle, device_ptrs[0], &patternA, sizeof(patternA), + allocation_size, 0, nullptr, 1, &external_events[0], nullptr, + &external_events[3], &command_handles[0])); + + // Command to fill ptr 1 + uint32_t patternB = 0xB; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + updatable_cmd_buf_handle, device_ptrs[1], &patternB, sizeof(patternB), + allocation_size, 0, nullptr, 1, &external_events[1], nullptr, + &external_events[4], &command_handles[1])); + + // Command to fill ptr 1 + uint32_t patternC = 0xC; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + updatable_cmd_buf_handle, device_ptrs[2], &patternC, sizeof(patternC), + allocation_size, 0, nullptr, 1, &external_events[2], nullptr, + &external_events[5], &command_handles[2])); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Queue read ptr 1 based on event returned from command-buffer command + std::array host_enqueue_ptrA1, host_enqueue_ptrB1, + host_enqueue_ptrC1; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptrA1.data(), + device_ptrs[0], allocation_size, 1, + &external_events[3], nullptr)); + + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptrB1.data(), + device_ptrs[1], allocation_size, 1, + &external_events[4], nullptr)); + + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptrC1.data(), + device_ptrs[2], allocation_size, 1, + 
&external_events[5], nullptr)); + + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptrA1[i], patternA); + ASSERT_EQ(host_enqueue_ptrB1[i], patternB); + ASSERT_EQ(host_enqueue_ptrC1[i], patternC); + } + + uint32_t pattern1 = 1; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(pattern1), + &pattern1, allocation_size, 0, nullptr, + &external_events[6])); + uint32_t pattern2 = 2; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[1], sizeof(pattern2), + &pattern2, allocation_size, 0, nullptr, + &external_events[7])); + + uint32_t pattern3 = 3; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[2], sizeof(pattern3), + &pattern3, allocation_size, 0, nullptr, + &external_events[8])); + + // Update command command-wait events to wait on new values + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1, + &external_events[6])); + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1, + &external_events[7])); + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handles[0], 1, + &external_events[8])); + + // Get a new signal events for command-buffer + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0], + &external_events[9])); + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0], + &external_events[10])); + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handles[0], + &external_events[11])); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + std::array host_enqueue_ptrA2, host_enqueue_ptrB2, + host_enqueue_ptrC2; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptrA2.data(), + device_ptrs[0], allocation_size, 1, + &external_events[9], nullptr)); + + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptrB2.data(), + device_ptrs[1], allocation_size, 1, + &external_events[10], nullptr)); + + 
ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptrC2.data(), + device_ptrs[2], allocation_size, 1, + &external_events[11], nullptr)); + // Verify + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + ASSERT_EQ(host_enqueue_ptrA2[i], patternA); + ASSERT_EQ(host_enqueue_ptrB2[i], patternB); + ASSERT_EQ(host_enqueue_ptrC2[i], patternC); + } +} diff --git a/test/conformance/exp_command_buffer/invalid_update.cpp b/test/conformance/exp_command_buffer/update/invalid_update.cpp similarity index 62% rename from test/conformance/exp_command_buffer/invalid_update.cpp rename to test/conformance/exp_command_buffer/update/invalid_update.cpp index afcb279fa9..036e9a464c 100644 --- a/test/conformance/exp_command_buffer/invalid_update.cpp +++ b/test/conformance/exp_command_buffer/update/invalid_update.cpp @@ -3,7 +3,8 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include "../fixtures.h" +#include #include // Negative tests that correct error codes are thrown on invalid update usage. 
@@ -36,7 +37,8 @@ struct InvalidUpdateTest // Append kernel command to command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, &command_handle)); + &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, + nullptr, nullptr, &command_handle)); ASSERT_NE(command_handle, nullptr); } @@ -64,7 +66,7 @@ struct InvalidUpdateTest static constexpr size_t local_size = 4; static constexpr size_t global_size = 32; static constexpr size_t global_offset = 0; - static constexpr size_t n_dimensions = 1; + static constexpr uint32_t n_dimensions = 1; static constexpr size_t allocation_size = sizeof(val) * global_size; void *shared_ptr = nullptr; ur_exp_command_buffer_command_handle_t command_handle = nullptr; @@ -89,10 +91,11 @@ TEST_P(InvalidUpdateTest, NotFinalizedCommandBuffer) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList &new_input_desc, // pNewValueArgList @@ -116,11 +119,16 @@ TEST_P(InvalidUpdateTest, NotUpdatableCommandBuffer) { EXPECT_NE(test_cmd_buf_handle, nullptr); // Append a kernel commands to command-buffer and close command-buffer + // Should be an error because we are trying to get command handle but + // command buffer is not updatable. 
ur_exp_command_buffer_command_handle_t test_command_handle = nullptr; - EXPECT_SUCCESS(urCommandBufferAppendKernelLaunchExp( - test_cmd_buf_handle, kernel, n_dimensions, &global_offset, &global_size, - &local_size, 0, nullptr, nullptr, &test_command_handle)); - EXPECT_NE(test_command_handle, nullptr); + ASSERT_EQ_RESULT(urCommandBufferAppendKernelLaunchExp( + test_cmd_buf_handle, kernel, n_dimensions, + &global_offset, &global_size, &local_size, 0, nullptr, + 0, nullptr, 0, nullptr, nullptr, nullptr, + &test_command_handle), + UR_RESULT_ERROR_INVALID_OPERATION); + ASSERT_EQ(test_command_handle, nullptr); EXPECT_SUCCESS(urCommandBufferFinalizeExp(test_cmd_buf_handle)); finalized = true; @@ -139,10 +147,11 @@ TEST_P(InvalidUpdateTest, NotUpdatableCommandBuffer) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList &new_input_desc, // pNewValueArgList @@ -151,11 +160,11 @@ TEST_P(InvalidUpdateTest, NotUpdatableCommandBuffer) { nullptr, // pNewLocalWorkSize }; - // Update command to command-buffer that doesn't have updatable set should - // be an error + // Since no command handle was returned Update command to command-buffer + // should also be an error. ur_result_t result = urCommandBufferUpdateKernelLaunchExp(test_command_handle, &update_desc); - EXPECT_EQ(UR_RESULT_ERROR_INVALID_OPERATION, result); + EXPECT_EQ(UR_RESULT_ERROR_INVALID_NULL_HANDLE, result); if (test_command_handle) { EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(test_command_handle)); @@ -165,98 +174,53 @@ TEST_P(InvalidUpdateTest, NotUpdatableCommandBuffer) { } } -// Test setting `pNewLocalWorkSize` to a non-NULL value and `pNewGlobalWorkSize` -// to NULL gives the correct error. 
-TEST_P(InvalidUpdateTest, GlobalLocalSizeMistach) { +// If the number of dimensions change, then the global work size and offset +// also need to be updated. +TEST_P(InvalidUpdateTest, InvalidDimensions) { ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); finalized = true; - size_t new_local_size = 16; - ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype - nullptr, // pNext - 0, // numNewMemObjArgs - 0, // numNewPointerArgs - 0, // numNewValueArgs - n_dimensions, // newWorkDim - nullptr, // pNewMemObjArgList - nullptr, // pNewPointerArgList - nullptr, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - nullptr, // pNewGlobalWorkSize - &new_local_size, // pNewLocalWorkSize - }; + uint32_t new_dimensions = 2; + std::array new_global_offset{0, 0}; + std::array new_global_size{64, 64}; - // Update command local size but not global size - ur_result_t result = - urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc); - ASSERT_EQ(UR_RESULT_ERROR_INVALID_OPERATION, result); -} - -// Test setting `pNewLocalWorkSize` to a non-NULL value when the command was -// created with a NULL local work size gives the correct error. 
-TEST_P(InvalidUpdateTest, ImplToUserDefinedLocalSize) { - // Append kernel command to command-buffer using NULL local work size - ur_exp_command_buffer_command_handle_t second_command_handle = nullptr; - ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( - updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, nullptr, 0, nullptr, nullptr, &second_command_handle)); - ASSERT_NE(second_command_handle, nullptr); - - EXPECT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); - finalized = true; - - size_t new_global_size = 64; - size_t new_local_size = 16; ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - 0, // numNewMemObjArgs - 0, // numNewPointerArgs - 0, // numNewValueArgs - n_dimensions, // newWorkDim - nullptr, // pNewMemObjArgList - nullptr, // pNewPointerArgList - nullptr, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - &new_global_size, // pNewGlobalWorkSize - &new_local_size, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + 0, // numNewValueArgs + new_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + nullptr, // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + new_global_size.data(), // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize }; - // Update command local size to non-NULL when created with NULL value - ur_result_t result = urCommandBufferUpdateKernelLaunchExp( - second_command_handle, &update_desc); - EXPECT_EQ(UR_RESULT_ERROR_INVALID_OPERATION, result); - - if (second_command_handle) { - EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(second_command_handle)); - } -} + ASSERT_EQ( + UR_RESULT_ERROR_INVALID_VALUE, + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); -// Test setting `pNewLocalWorkSize` to a NULL value when the command was -// created with a non-NULL local work size 
gives the correct error. -TEST_P(InvalidUpdateTest, UserToImplDefinedLocalSize) { - ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); - finalized = true; - - size_t new_global_size = 64; - ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - 0, // numNewMemObjArgs - 0, // numNewPointerArgs - 0, // numNewValueArgs - n_dimensions, // newWorkDim - nullptr, // pNewMemObjArgList - nullptr, // pNewPointerArgList - nullptr, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - &new_global_size, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + 0, // numNewValueArgs + new_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + nullptr, // pNewValueArgList + new_global_offset.data(), // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize }; - // Update command local size to NULL when created with non-NULL value - ur_result_t result = - urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc); - ASSERT_EQ(UR_RESULT_ERROR_INVALID_OPERATION, result); + ASSERT_EQ( + UR_RESULT_ERROR_INVALID_VALUE, + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); } diff --git a/test/conformance/exp_command_buffer/update/kernel_event_sync.cpp b/test/conformance/exp_command_buffer/update/kernel_event_sync.cpp new file mode 100644 index 0000000000..c7a6713da7 --- /dev/null +++ b/test/conformance/exp_command_buffer/update/kernel_event_sync.cpp @@ -0,0 +1,305 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "../fixtures.h" +#include + +struct KernelCommandEventSyncUpdateTest + : uur::command_buffer::urUpdatableCommandBufferExpExecutionTest { + void SetUp() override { + program_name = "saxpy_usm"; + UUR_RETURN_ON_FATAL_FAILURE( + urUpdatableCommandBufferExpExecutionTest::SetUp()); + + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::checkCommandBufferUpdateSupport( + device, + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_EVENTS)); + + for (auto &device_ptr : device_ptrs) { + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + allocation_size, &device_ptr)); + ASSERT_NE(device_ptr, nullptr); + } + + // Index 0 is output + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 0, nullptr, device_ptrs[2])); + // Index 1 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, 1, sizeof(A), nullptr, &A)); + // Index 2 is X + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 2, nullptr, device_ptrs[0])); + // Index 3 is Y + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 3, nullptr, device_ptrs[1])); + } + + virtual void TearDown() override { + for (auto &device_ptr : device_ptrs) { + if (device_ptr) { + EXPECT_SUCCESS(urUSMFree(context, device_ptr)); + } + } + + for (auto &event : external_events) { + if (event) { + EXPECT_SUCCESS(urEventRelease(event)); + } + } + + if (command_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle)); + } + + UUR_RETURN_ON_FATAL_FAILURE( + urUpdatableCommandBufferExpExecutionTest::TearDown()); + } + + // First two device pointers are inputs to be tested, last is the output + // from the saxpy kernel command. 
+ std::array device_ptrs = {nullptr, nullptr, nullptr}; + std::array external_events = {nullptr, nullptr, + nullptr, nullptr}; + std::array sync_points = {0, 0}; + ur_exp_command_buffer_command_handle_t command_handle = nullptr; + static constexpr size_t elements = 64; + static constexpr size_t global_offset = 0; + static constexpr size_t allocation_size = sizeof(uint32_t) * elements; + static constexpr size_t A = 2; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(KernelCommandEventSyncUpdateTest); + +// Tests updating the signal and wait event dependencies of the saxpy +// command in a command-buffer. +TEST_P(KernelCommandEventSyncUpdateTest, Basic) { + // Initialize data X with queue submission + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + &external_events[0])); + + // Initialize data Y with command-buffer command + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( + updatable_cmd_buf_handle, device_ptrs[1], &patternY, sizeof(patternY), + allocation_size, 0, nullptr, 0, nullptr, &sync_points[0], nullptr, + nullptr)); + + // Kernel command for SAXPY waiting on command and signal event + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, 1, &global_offset, &elements, nullptr, + 0, nullptr, 1, &sync_points[0], 1, &external_events[0], &sync_points[1], + &external_events[1], &command_handle)); + ASSERT_NE(command_handle, nullptr); + + // command-buffer command that reads output to host + std::array host_command_ptr{}; + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + updatable_cmd_buf_handle, host_command_ptr.data(), device_ptrs[2], + allocation_size, 1, &sync_points[1], 0, nullptr, nullptr, nullptr, + nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Queue command that 
reads output to host + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[2], allocation_size, 1, + &external_events[1], nullptr)); + + ASSERT_SUCCESS(urQueueFinish(queue)); + + for (size_t i = 0; i < elements; i++) { + auto ref = (patternX * A) + patternY; + ASSERT_EQ(host_command_ptr[i], ref); + ASSERT_EQ(host_enqueue_ptr[i], ref); + } + + // Reset output data + std::memset(host_command_ptr.data(), 0, allocation_size); + std::memset(host_enqueue_ptr.data(), 0, allocation_size); + + // Set data X to new value with queue submission + patternX = 0xBEEF; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + &external_events[2])); + + // Update kernel command-wait event to wait on fill of new x value + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handle, 1, + &external_events[2])); + + // Get a new signal event for command-buffer + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handle, + &external_events[3])); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Read data back with a queue operation waiting on updated kernel command + // signal event. 
+ ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[2], allocation_size, 1, + &external_events[3], nullptr)); + + // Verify results + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + auto ref = (patternX * A) + patternY; + ASSERT_EQ(host_command_ptr[i], ref); + ASSERT_EQ(host_enqueue_ptr[i], ref); + } +} + +// Test updating wait events to a command with multiple wait events +TEST_P(KernelCommandEventSyncUpdateTest, TwoWaitEvents) { + // Initialize data X with queue submission + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + &external_events[0])); + + // Initialize data Y with command-buffer command + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[1], sizeof(patternY), + &patternY, allocation_size, 0, nullptr, + &external_events[1])); + + // Kernel command for SAXPY waiting on command and signal event + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, 1, &global_offset, &elements, nullptr, + 0, nullptr, 0, nullptr, 2, &external_events[0], &sync_points[0], + &external_events[2], &command_handle)); + ASSERT_NE(command_handle, nullptr); + + // command-buffer command that reads output to host + std::array host_command_ptr{}; + ASSERT_SUCCESS(urCommandBufferAppendUSMMemcpyExp( + updatable_cmd_buf_handle, host_command_ptr.data(), device_ptrs[2], + allocation_size, 1, &sync_points[0], 0, nullptr, nullptr, nullptr, + nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Queue command that reads output to host + std::array host_enqueue_ptr{}; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[2], allocation_size, 1, + &external_events[2], nullptr)); + + 
ASSERT_SUCCESS(urQueueFinish(queue)); + + for (size_t i = 0; i < elements; i++) { + auto ref = (patternX * A) + patternY; + ASSERT_EQ(host_command_ptr[i], ref); + ASSERT_EQ(host_enqueue_ptr[i], ref); + } + + // Reset output data + std::memset(host_command_ptr.data(), 0, allocation_size); + std::memset(host_enqueue_ptr.data(), 0, allocation_size); + + // Set data X to new value with queue submission + patternX = 0xBEEF; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + &external_events[3])); + + // Set data X to new value with queue submission + patternY = 0xBAD; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[1], sizeof(patternY), + &patternY, allocation_size, 0, nullptr, + &external_events[4])); + + // Update kernel command-wait event to wait on fill of new x value + ASSERT_SUCCESS(urCommandBufferUpdateWaitEventsExp(command_handle, 2, + &external_events[3])); + + // Get a new signal event for command-buffer + ASSERT_SUCCESS(urCommandBufferUpdateSignalEventExp(command_handle, + &external_events[5])); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Read data back with a queue operation waiting on updated kernel command + // signal event. + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_enqueue_ptr.data(), + device_ptrs[2], allocation_size, 1, + &external_events[5], nullptr)); + + // Verify results + ASSERT_SUCCESS(urQueueFinish(queue)); + for (size_t i = 0; i < elements; i++) { + auto ref = (patternX * A) + patternY; + ASSERT_EQ(host_command_ptr[i], ref); + ASSERT_EQ(host_enqueue_ptr[i], ref); + } +} + +// Tests the correct error is returned when a different number +// of wait events is passed during update. 
+TEST_P(KernelCommandEventSyncUpdateTest, InvalidWaitUpdate) { + // Initialize data X with queue submission + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + &external_events[0])); + + // Initialize data Y with queue submission + uint32_t patternY = 0xA; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[1], sizeof(patternY), + &patternY, allocation_size, 0, nullptr, + &external_events[1])); + + // Initialize data Z with queue submission + int32_t zero_pattern = 0; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[2], sizeof(zero_pattern), + &zero_pattern, allocation_size, 0, nullptr, + &external_events[2])); + + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, 1, &global_offset, &elements, nullptr, + 0, nullptr, 0, nullptr, 1, &external_events[0], nullptr, nullptr, + &command_handle)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + // Increase number of events should be an error + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST, + urCommandBufferUpdateWaitEventsExp(command_handle, 2, + &external_events[1])); + + // decrease number of events should be an error + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST, + urCommandBufferUpdateWaitEventsExp(command_handle, 0, nullptr)); +} + +// Tests the correct error is returned when trying to update the +// signal event from a command that was not created with one. 
+TEST_P(KernelCommandEventSyncUpdateTest, InvalidSignalUpdate) { + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, 1, &global_offset, &elements, nullptr, + 0, nullptr, 0, nullptr, 0, nullptr, nullptr, nullptr, &command_handle)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + uint32_t patternX = 42; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptrs[0], sizeof(patternX), + &patternX, allocation_size, 0, nullptr, + &external_events[0])); + + // Increase number of events should be an error + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_OPERATION, + urCommandBufferUpdateSignalEventExp(command_handle, + &external_events[0])); +} diff --git a/test/conformance/exp_command_buffer/update/kernel_handle_update.cpp b/test/conformance/exp_command_buffer/update/kernel_handle_update.cpp new file mode 100644 index 0000000000..c74af937f6 --- /dev/null +++ b/test/conformance/exp_command_buffer/update/kernel_handle_update.cpp @@ -0,0 +1,475 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "../fixtures.h" +#include "uur/raii.h" +#include +#include + +struct TestSaxpyKernel : public uur::command_buffer::TestKernel { + + TestSaxpyKernel(ur_platform_handle_t Platform, ur_context_handle_t Context, + ur_device_handle_t Device) + : TestKernel("saxpy_usm", Platform, Context, Device) {} + + ~TestSaxpyKernel() override = default; + + void setUpKernel() override { + + ASSERT_NO_FATAL_FAILURE(buildKernel()); + + const size_t AllocationSize = sizeof(uint32_t) * GlobalSize; + for (auto &SharedPtr : Allocations) { + ASSERT_SUCCESS(urUSMSharedAlloc(Context, Device, nullptr, nullptr, + AllocationSize, &SharedPtr)); + ASSERT_NE(SharedPtr, nullptr); + + std::vector pattern(AllocationSize); + uur::generateMemFillPattern(pattern); + std::memcpy(SharedPtr, pattern.data(), AllocationSize); + } + + // Index 0 is the output + ASSERT_SUCCESS( + urKernelSetArgPointer(Kernel, 0, nullptr, Allocations[0])); + // Index 1 is A + ASSERT_SUCCESS(urKernelSetArgValue(Kernel, 1, sizeof(A), nullptr, &A)); + // Index 2 is X + ASSERT_SUCCESS( + urKernelSetArgPointer(Kernel, 2, nullptr, Allocations[1])); + // Index 3 is Y + ASSERT_SUCCESS( + urKernelSetArgPointer(Kernel, 3, nullptr, Allocations[2])); + + UpdatePointerDesc[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 2, // argIndex + nullptr, // pProperties + &Allocations[0], // pArgValue + }; + + UpdatePointerDesc[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 2, // argIndex + nullptr, // pProperties + &Allocations[1], // pArgValue + }; + + UpdatePointerDesc[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 3, // argIndex + nullptr, // pProperties + &Allocations[2], // pArgValue + }; + + UpdateValDesc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // 
pNext + 1, // argIndex + sizeof(A), // argSize + nullptr, // pProperties + &A, // hArgValue + }; + + UpdateDesc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + Kernel, // hNewKernel + 0, // numNewMemObjArgs + 3, // numNewPointerArgs + 1, // numNewValueArgs + NDimensions, // newWorkDim + nullptr, // pNewMemObjArgList + UpdatePointerDesc.data(), // pNewPointerArgList + &UpdateValDesc, // pNewValueArgList + &GlobalOffset, // pNewGlobalWorkOffset + &GlobalSize, // pNewGlobalWorkSize + &LocalSize, // pNewLocalWorkSize + }; + } + + void destroyKernel() override { + for (auto &Allocation : Allocations) { + if (Allocation) { + EXPECT_SUCCESS(urUSMFree(Context, Allocation)); + } + } + ASSERT_NO_FATAL_FAILURE(TestKernel::destroyKernel()); + } + + void validate() override { + auto *output = static_cast(Allocations[0]); + auto *X = static_cast(Allocations[1]); + auto *Y = static_cast(Allocations[2]); + + for (size_t i = 0; i < GlobalSize; i++) { + uint32_t result = A * X[i] + Y[i]; + ASSERT_EQ(result, output[i]); + } + } + + std::array + UpdatePointerDesc; + ur_exp_command_buffer_update_value_arg_desc_t UpdateValDesc; + ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc; + + size_t LocalSize = 4; + size_t GlobalSize = 32; + size_t GlobalOffset = 0; + uint32_t NDimensions = 1; + uint32_t A = 42; + + std::array Allocations = {nullptr, nullptr, nullptr}; +}; + +struct TestFill2DKernel : public uur::command_buffer::TestKernel { + + TestFill2DKernel(ur_platform_handle_t Platform, ur_context_handle_t Context, + ur_device_handle_t Device) + : TestKernel("fill_usm_2d", Platform, Context, Device) {} + + ~TestFill2DKernel() override = default; + + void setUpKernel() override { + ASSERT_NO_FATAL_FAILURE(buildKernel()); + + const size_t allocation_size = sizeof(uint32_t) * SizeX * SizeY; + ASSERT_SUCCESS(urUSMSharedAlloc(Context, Device, nullptr, nullptr, + allocation_size, &Memory)); + + // Index 0 is the output + 
ASSERT_SUCCESS(urKernelSetArgPointer(Kernel, 0, nullptr, Memory)); + // Index 1 is the fill value + ASSERT_SUCCESS( + urKernelSetArgValue(Kernel, 1, sizeof(Val), nullptr, &Val)); + + ASSERT_NE(Memory, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(Memory, pattern.data(), allocation_size); + + UpdatePointerDesc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 0, // argIndex + nullptr, // pProperties + &Memory, // pArgValue + }; + + UpdateValDesc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1, // argIndex + sizeof(Val), // argSize + nullptr, // pProperties + &Val, // hArgValue + }; + + UpdateDesc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + Kernel, // hNewKernel + 0, // numNewMemObjArgs + 1, // numNewPointerArgs + 1, // numNewValueArgs + NDimensions, // newWorkDim + nullptr, // pNewMemObjArgList + &UpdatePointerDesc, // pNewPointerArgList + &UpdateValDesc, // pNewValueArgList + GlobalOffset.data(), // pNewGlobalWorkOffset + GlobalSize.data(), // pNewGlobalWorkSize + LocalSize.data(), // pNewLocalWorkSize + }; + } + + void destroyKernel() override { + if (Memory) { + EXPECT_SUCCESS(urUSMFree(Context, Memory)); + } + ASSERT_NO_FATAL_FAILURE(TestKernel::destroyKernel()); + } + + void validate() override { + for (size_t i = 0; i < SizeX * SizeY; i++) { + ASSERT_EQ(static_cast(Memory)[i], Val); + } + } + + ur_exp_command_buffer_update_pointer_arg_desc_t UpdatePointerDesc; + ur_exp_command_buffer_update_value_arg_desc_t UpdateValDesc; + ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc; + + std::vector LocalSize = {4, 4}; + const size_t SizeX = 64; + const size_t SizeY = 64; + std::vector GlobalSize = {SizeX, SizeY}; + std::vector GlobalOffset = {0, 0}; + uint32_t NDimensions = 2; + + void *Memory; + uint32_t Val = 42; +}; + +struct 
urCommandBufferKernelHandleUpdateTest + : uur::command_buffer::urCommandBufferMultipleKernelUpdateTest { + virtual void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferMultipleKernelUpdateTest:: + SetUp()); + + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::checkCommandBufferUpdateSupport( + device, + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE)); + + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + SaxpyKernel = std::make_shared( + TestSaxpyKernel(platform, context, device)); + FillUSM2DKernel = std::make_shared( + TestFill2DKernel(platform, context, device)); + TestKernels.push_back(SaxpyKernel); + TestKernels.push_back(FillUSM2DKernel); + + this->setUpKernels(); + } + + virtual void TearDown() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferMultipleKernelUpdateTest:: + TearDown()); + } + + std::shared_ptr SaxpyKernel; + std::shared_ptr FillUSM2DKernel; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCommandBufferKernelHandleUpdateTest); + +/* Tests that it is possible to update the kernel handle of a command-buffer node. + * This test launches a Saxpy kernel using a command-buffer and then updates the + * node with a completely different kernel that does a fill 2D operation. 
*/ +TEST_P(urCommandBufferKernelHandleUpdateTest, Success) { + + std::vector KernelAlternatives = { + FillUSM2DKernel->Kernel}; + + uur::raii::CommandBufferCommand CommandHandle; + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, SaxpyKernel->Kernel, SaxpyKernel->NDimensions, + &(SaxpyKernel->GlobalOffset), &(SaxpyKernel->GlobalSize), + &(SaxpyKernel->LocalSize), KernelAlternatives.size(), + KernelAlternatives.data(), 0, nullptr, 0, nullptr, nullptr, nullptr, + CommandHandle.ptr())); + ASSERT_NE(CommandHandle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( + CommandHandle, &FillUSM2DKernel->UpdateDesc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + ASSERT_NO_FATAL_FAILURE(SaxpyKernel->validate()); + ASSERT_NO_FATAL_FAILURE(FillUSM2DKernel->validate()); +} + +/* Test that updates to the command kernel handle are stored in the command handle */ +TEST_P(urCommandBufferKernelHandleUpdateTest, UpdateAgain) { + + std::vector KernelAlternatives = { + FillUSM2DKernel->Kernel}; + + uur::raii::CommandBufferCommand CommandHandle; + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, SaxpyKernel->Kernel, SaxpyKernel->NDimensions, + &(SaxpyKernel->GlobalOffset), &(SaxpyKernel->GlobalSize), + &(SaxpyKernel->LocalSize), KernelAlternatives.size(), + KernelAlternatives.data(), 0, nullptr, 0, nullptr, nullptr, nullptr, + CommandHandle.ptr())); + ASSERT_NE(CommandHandle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( + CommandHandle, 
&FillUSM2DKernel->UpdateDesc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + ASSERT_NO_FATAL_FAILURE(SaxpyKernel->validate()); + ASSERT_NO_FATAL_FAILURE(FillUSM2DKernel->validate()); + + // If the Kernel was not stored properly in the command, then this could potentially fail since + // it would try to use the Saxpy kernel + FillUSM2DKernel->Val = 78; + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( + CommandHandle, &FillUSM2DKernel->UpdateDesc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + ASSERT_NO_FATAL_FAILURE(FillUSM2DKernel->validate()); +} + +/* Test that it is possible to change the kernel handle in a command and later restore it to the original handle */ +TEST_P(urCommandBufferKernelHandleUpdateTest, RestoreOriginalKernel) { + + std::vector KernelAlternatives = { + FillUSM2DKernel->Kernel}; + + uur::raii::CommandBufferCommand CommandHandle; + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, SaxpyKernel->Kernel, SaxpyKernel->NDimensions, + &(SaxpyKernel->GlobalOffset), &(SaxpyKernel->GlobalSize), + &(SaxpyKernel->LocalSize), KernelAlternatives.size(), + KernelAlternatives.data(), 0, nullptr, 0, nullptr, nullptr, nullptr, + CommandHandle.ptr())); + ASSERT_NE(CommandHandle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( + CommandHandle, &FillUSM2DKernel->UpdateDesc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + ASSERT_NO_FATAL_FAILURE(SaxpyKernel->validate()); + ASSERT_NO_FATAL_FAILURE(FillUSM2DKernel->validate()); + + // Updating A, so 
that the second launch of the saxpy kernel actually has a different output. + SaxpyKernel->A = 20; + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( + CommandHandle, &SaxpyKernel->UpdateDesc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + ASSERT_NO_FATAL_FAILURE(SaxpyKernel->validate()); +} + +TEST_P(urCommandBufferKernelHandleUpdateTest, KernelAlternativeNotRegistered) { + + uur::raii::CommandBufferCommand CommandHandle; + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, SaxpyKernel->Kernel, SaxpyKernel->NDimensions, + &(SaxpyKernel->GlobalOffset), &(SaxpyKernel->GlobalSize), + &(SaxpyKernel->LocalSize), 0, nullptr, 0, nullptr, 0, nullptr, nullptr, + nullptr, CommandHandle.ptr())); + ASSERT_NE(CommandHandle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_VALUE, + urCommandBufferUpdateKernelLaunchExp( + CommandHandle, &FillUSM2DKernel->UpdateDesc)); +} + +TEST_P(urCommandBufferKernelHandleUpdateTest, + RegisterInvalidKernelAlternative) { + + std::vector KernelAlternatives = {SaxpyKernel->Kernel}; + + ur_exp_command_buffer_command_handle_t CommandHandle; + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_VALUE, + urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, SaxpyKernel->Kernel, + SaxpyKernel->NDimensions, &(SaxpyKernel->GlobalOffset), + &(SaxpyKernel->GlobalSize), &(SaxpyKernel->LocalSize), + KernelAlternatives.size(), KernelAlternatives.data(), + 0, nullptr, 0, nullptr, nullptr, nullptr, + &CommandHandle)); +} + +using urCommandBufferValidUpdateParametersTest = + urCommandBufferKernelHandleUpdateTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCommandBufferValidUpdateParametersTest); + +// Test that updating the dimensions of a kernel command 
does not cause an error. +TEST_P(urCommandBufferValidUpdateParametersTest, + UpdateDimensionsWithoutUpdatingKernel) { + + uur::raii::CommandBufferCommand CommandHandle; + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, FillUSM2DKernel->Kernel, + FillUSM2DKernel->NDimensions, FillUSM2DKernel->GlobalOffset.data(), + FillUSM2DKernel->GlobalSize.data(), FillUSM2DKernel->LocalSize.data(), + 0, nullptr, 0, nullptr, 0, nullptr, nullptr, nullptr, + CommandHandle.ptr())); + ASSERT_NE(CommandHandle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + ASSERT_NO_FATAL_FAILURE(FillUSM2DKernel->validate()); + + size_t newGlobalWorkSize = + FillUSM2DKernel->GlobalSize[0] * FillUSM2DKernel->GlobalSize[1]; + size_t newGlobalWorkOffset = 0; + + // Since the fill2D kernel relies on the globalID, it will still work if we + // change the work dimensions to 1. + FillUSM2DKernel->UpdateDesc.newWorkDim = 1; + FillUSM2DKernel->UpdateDesc.pNewGlobalWorkSize = &newGlobalWorkSize; + FillUSM2DKernel->UpdateDesc.pNewGlobalWorkOffset = &newGlobalWorkOffset; + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( + CommandHandle, &FillUSM2DKernel->UpdateDesc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + ASSERT_NO_FATAL_FAILURE(FillUSM2DKernel->validate()); +} + +// Test that updating only the local work size does not cause an error. 
+TEST_P(urCommandBufferValidUpdateParametersTest, UpdateOnlyLocalWorkSize) { + + std::vector KernelAlternatives = { + FillUSM2DKernel->Kernel}; + + uur::raii::CommandBufferCommand CommandHandle; + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, SaxpyKernel->Kernel, SaxpyKernel->NDimensions, + &(SaxpyKernel->GlobalOffset), &(SaxpyKernel->GlobalSize), + &(SaxpyKernel->LocalSize), KernelAlternatives.size(), + KernelAlternatives.data(), 0, nullptr, 0, nullptr, nullptr, nullptr, + CommandHandle.ptr())); + ASSERT_NE(CommandHandle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + SaxpyKernel->UpdateDesc.pNewGlobalWorkOffset = nullptr; + SaxpyKernel->UpdateDesc.pNewGlobalWorkSize = nullptr; + size_t newLocalSize = SaxpyKernel->LocalSize * 4; + SaxpyKernel->UpdateDesc.pNewLocalWorkSize = &newLocalSize; + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( + CommandHandle, &SaxpyKernel->UpdateDesc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + ASSERT_NO_FATAL_FAILURE(SaxpyKernel->validate()); +} diff --git a/test/conformance/exp_command_buffer/ndrange_update.cpp b/test/conformance/exp_command_buffer/update/ndrange_update.cpp similarity index 56% rename from test/conformance/exp_command_buffer/ndrange_update.cpp rename to test/conformance/exp_command_buffer/update/ndrange_update.cpp index 3c053fe4b9..31f8d46d39 100644 --- a/test/conformance/exp_command_buffer/ndrange_update.cpp +++ b/test/conformance/exp_command_buffer/update/ndrange_update.cpp @@ -3,7 +3,8 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include "../fixtures.h" +#include #include // Test that updating a command-buffer with a single kernel command @@ -29,30 +30,22 @@ struct 
NDRangeUpdateTest std::memset(shared_ptr, 0, allocation_size); ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, shared_ptr)); - - // Add a 3 dimension kernel command to command-buffer and close - // command-buffer - ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( - updatable_cmd_buf_handle, kernel, n_dimensions, - global_offset.data(), global_size.data(), local_size.data(), 0, - nullptr, nullptr, &command_handle)); - ASSERT_NE(command_handle, nullptr); - - ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); } // For each work-item the kernel prints the global id and local id in each // of the 3 dimensions to an offset in the output based on global linear // id. void Validate(std::array global_size, - std::array local_size, + std::optional> local_size, std::array global_offset) { + // DPC++ swaps the X & Z dimension for 3 Dimensional kernels // between those set by user and SPIR-V builtins. // See `ReverseRangeDimensionsForKernel()` in commands.cpp - std::swap(global_size[0], global_size[2]); - std::swap(local_size[0], local_size[2]); + if (local_size.has_value()) { + std::swap(local_size.value()[0], local_size.value()[2]); + } std::swap(global_offset[0], global_offset[2]); // Verify global ID and local ID of each work item @@ -73,13 +66,15 @@ struct NDRangeUpdateTest EXPECT_EQ(global_id_y, y + global_offset[1]); EXPECT_EQ(global_id_z, z + global_offset[2]); - const int local_id_x = wi_ptr[3]; - const int local_id_y = wi_ptr[4]; - const int local_id_z = wi_ptr[5]; + if (local_size.has_value()) { + const int local_id_x = wi_ptr[3]; + const int local_id_y = wi_ptr[4]; + const int local_id_z = wi_ptr[5]; - EXPECT_EQ(local_id_x, x % local_size[0]); - EXPECT_EQ(local_id_y, y % local_size[1]); - EXPECT_EQ(local_id_z, z % local_size[2]); + EXPECT_EQ(local_id_x, x % local_size.value()[0]); + EXPECT_EQ(local_id_y, y % local_size.value()[1]); + EXPECT_EQ(local_id_z, z % local_size.value()[2]); + } } } } @@ -99,7 +94,7 @@ struct NDRangeUpdateTest 
} static constexpr size_t elements_per_id = 6; - static constexpr size_t n_dimensions = 3; + static constexpr uint32_t n_dimensions = 3; static constexpr std::array global_size = {8, 8, 8}; static constexpr std::array local_size = {1, 2, 2}; static constexpr std::array global_offset = {0, 4, 4}; @@ -112,10 +107,17 @@ struct NDRangeUpdateTest UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(NDRangeUpdateTest); -// Keep the kernel work dimensions as 3, and update local size and global -// offset. +// Add a 3 dimension kernel command to the command-buffer and update the +// local size and global offset TEST_P(NDRangeUpdateTest, Update3D) { - // Run command-buffer prior to update an verify output + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, global_offset.data(), + global_size.data(), local_size.data(), 0, nullptr, 0, nullptr, 0, + nullptr, nullptr, nullptr, &command_handle)); + ASSERT_NE(command_handle, nullptr); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + // Run command-buffer prior to update and verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); @@ -128,10 +130,11 @@ TEST_P(NDRangeUpdateTest, Update3D) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 0, // numNewValueArgs - 3, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList nullptr, // pNewValueArgList @@ -151,9 +154,17 @@ TEST_P(NDRangeUpdateTest, Update3D) { Validate(new_global_size, new_local_size, new_global_offset); } -// Update the kernel work dimensions to use 1 in the Z dimension, -// and update global size, local size, and global offset to new values. 
+// Add a 3 dimension kernel command to the command-buffer. Update the kernel +// work dimensions to be 1 in the Z dimension, and update global size, local +// size, and global offset to new values. TEST_P(NDRangeUpdateTest, Update2D) { + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, global_offset.data(), + global_size.data(), local_size.data(), 0, nullptr, 0, nullptr, 0, + nullptr, nullptr, nullptr, &command_handle)); + ASSERT_NE(command_handle, nullptr); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + // Run command-buffer prior to update an verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); @@ -172,10 +183,11 @@ TEST_P(NDRangeUpdateTest, Update2D) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 0, // numNewValueArgs - 3, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList nullptr, // pNewValueArgList @@ -199,10 +211,18 @@ TEST_P(NDRangeUpdateTest, Update2D) { Validate(new_global_size, new_local_size, new_global_offset); } -// Update the kernel work dimensions to be 1 in Y & Z dimensions, and check -// that the previously set global size, local size, and global offset update +// Add a 3 dimension kernel command to the command-buffer. Update the kernel +// work dimensions to be 1 in the Y & Z dimensions, and check that the +// previously set global size, local size, and global offset update // accordingly. 
TEST_P(NDRangeUpdateTest, Update1D) { + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, global_offset.data(), + global_size.data(), local_size.data(), 0, nullptr, 0, nullptr, 0, + nullptr, nullptr, nullptr, &command_handle)); + ASSERT_NE(command_handle, nullptr); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + // Run command-buffer prior to update an verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); @@ -216,10 +236,11 @@ TEST_P(NDRangeUpdateTest, Update1D) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 0, // numNewValueArgs - 3, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList nullptr, // pNewValueArgList @@ -243,26 +264,108 @@ TEST_P(NDRangeUpdateTest, Update1D) { Validate(new_global_size, new_local_size, new_global_offset); } -// Test error code is returned if work dimension parameter changes -TEST_P(NDRangeUpdateTest, Invalid) { - const size_t new_work_dim = n_dimensions - 1; +// Test that setting `pNewLocalWorkSize` to a non-NULL value when the command +// was created with a NULL local work size works. +TEST_P(NDRangeUpdateTest, ImplToUserDefinedLocalSize) { + + // Append a kernel node without setting the local work-size. 
+ ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, global_offset.data(), + global_size.data(), nullptr, 0, nullptr, 0, nullptr, 0, nullptr, + nullptr, nullptr, &command_handle)); + ASSERT_NE(command_handle, nullptr); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + // Run command-buffer prior to update an verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Can't validate the local size because it is generated by the + // implementation. + Validate(global_size, std::nullopt, global_offset); + + // Set local size and global offset to update to + std::array new_local_size = {4, 2, 2}; + std::array new_global_offset = {3, 2, 1}; + std::array new_global_size = global_size; + + // Set a user-defined local work-size in the update desc. ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - 0, // numNewMemObjArgs - 0, // numNewPointerArgs - 0, // numNewValueArgs - new_work_dim, // newWorkDim - nullptr, // pNewMemObjArgList - nullptr, // pNewPointerArgList - nullptr, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - nullptr, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + 0, // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + nullptr, // pNewValueArgList + new_global_offset.data(), // pNewGlobalWorkOffset + new_global_size.data(), // pNewGlobalWorkSize + new_local_size.data(), // pNewLocalWorkSize }; - // Update command to command-buffer to use different work dim - ur_result_t result = - urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc); - ASSERT_EQ(UR_RESULT_ERROR_INVALID_OPERATION, result); + // Update kernel and 
enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that the user defined local work-size was set correctly. + Validate(new_global_size, new_local_size, new_global_offset); +} + +// Test that setting `pNewLocalWorkSize` to a NULL value when the command was +// created with a non-NULL local work size works. +TEST_P(NDRangeUpdateTest, UserToImplDefinedLocalSize) { + + // Append a kernel node and set a user defined local work-size. + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, global_offset.data(), + global_size.data(), local_size.data(), 0, nullptr, 0, nullptr, 0, + nullptr, nullptr, nullptr, &command_handle)); + ASSERT_NE(command_handle, nullptr); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + // Run command-buffer prior to update and verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + Validate(global_size, local_size, global_offset); + + // Set local size and global offset to update to + std::array new_global_offset = {3, 2, 1}; + std::array new_global_size = global_size; + + // Do not set a local-work size in the update desc to let the implementation + // decide which local-work size should be used. 
+ ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + 0, // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + nullptr, // pNewValueArgList + new_global_offset.data(), // pNewGlobalWorkOffset + new_global_size.data(), // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that the kernel ran successfully and the global size is the + // expected. Cannot check the local size since it's implementation defined. + Validate(new_global_size, std::nullopt, new_global_offset); } diff --git a/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp b/test/conformance/exp_command_buffer/update/usm_fill_kernel_update.cpp similarity index 93% rename from test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp rename to test/conformance/exp_command_buffer/update/usm_fill_kernel_update.cpp index 606744cd86..85e6beccf9 100644 --- a/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp +++ b/test/conformance/exp_command_buffer/update/usm_fill_kernel_update.cpp @@ -3,7 +3,8 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include "../fixtures.h" +#include #include // Test that updating a command-buffer with a single kernel command @@ -37,7 +38,8 @@ struct USMFillCommandTest // Append kernel command to command-buffer and close command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - 
&global_size, &local_size, 0, nullptr, nullptr, &command_handle)); + &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, + nullptr, nullptr, &command_handle)); ASSERT_NE(command_handle, nullptr); ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); @@ -70,7 +72,7 @@ struct USMFillCommandTest static constexpr size_t local_size = 4; static constexpr size_t global_size = 32; static constexpr size_t global_offset = 0; - static constexpr size_t n_dimensions = 1; + static constexpr uint32_t n_dimensions = 1; static constexpr size_t allocation_size = sizeof(val) * global_size; void *shared_ptr = nullptr; void *new_shared_ptr = nullptr; @@ -119,16 +121,17 @@ TEST_P(USMFillCommandTest, UpdateParameters) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - 0, // numNewMemObjArgs - 1, // numNewPointerArgs - 1, // numNewValueArgs - static_cast(n_dimensions), // newWorkDim - nullptr, // pNewMemObjArgList - &new_output_desc, // pNewPointerArgList - &new_input_desc, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - &new_global_size, // pNewGlobalWorkSize - &new_local_size, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + 1, // numNewPointerArgs + 1, // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + &new_output_desc, // pNewPointerArgList + &new_input_desc, // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + &new_global_size, // pNewGlobalWorkSize + &new_local_size, // pNewLocalWorkSize }; // Update kernel and enqueue command-buffer again @@ -172,10 +175,11 @@ TEST_P(USMFillCommandTest, UpdateBeforeEnqueue) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 1, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + 
n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList &new_output_desc, // pNewPointerArgList &new_input_desc, // pNewValueArgList @@ -233,8 +237,8 @@ struct USMMultipleFillCommandTest // Append kernel and store returned handle ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &elements, &local_size, 0, nullptr, nullptr, - &command_handles[k])); + &elements, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, + nullptr, nullptr, &command_handles[k])); ASSERT_NE(command_handles[k], nullptr); } @@ -323,10 +327,11 @@ TEST_P(USMMultipleFillCommandTest, UpdateAllKernels) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 1, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList &new_output_desc, // pNewPointerArgList &new_input_desc, // pNewValueArgList diff --git a/test/conformance/exp_command_buffer/usm_saxpy_kernel_update.cpp b/test/conformance/exp_command_buffer/update/usm_saxpy_kernel_update.cpp similarity index 96% rename from test/conformance/exp_command_buffer/usm_saxpy_kernel_update.cpp rename to test/conformance/exp_command_buffer/update/usm_saxpy_kernel_update.cpp index 0cb50cb3f1..1dc34c00fd 100644 --- a/test/conformance/exp_command_buffer/usm_saxpy_kernel_update.cpp +++ b/test/conformance/exp_command_buffer/update/usm_saxpy_kernel_update.cpp @@ -3,7 +3,8 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include "../fixtures.h" +#include #include // Test that updating a command-buffer with a single kernel command @@ -81,7 +82,8 @@ struct USMSaxpyKernelTest : USMSaxpyKernelTestBase { // Append kernel command to command-buffer and close command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( 
updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, &command_handle)); + &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, + nullptr, nullptr, &command_handle)); ASSERT_NE(command_handle, nullptr); ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); @@ -147,10 +149,11 @@ TEST_P(USMSaxpyKernelTest, UpdateParameters) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 2, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList new_input_descs, // pNewPointerArgList &new_A_desc, // pNewValueArgList @@ -181,8 +184,8 @@ struct USMMultiSaxpyKernelTest : USMSaxpyKernelTestBase { for (unsigned node = 0; node < nodes; node++) { ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, - &command_handles[node])); + &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, + nullptr, nullptr, &command_handles[node])); ASSERT_NE(command_handles[node], nullptr); } @@ -252,10 +255,11 @@ TEST_P(USMMultiSaxpyKernelTest, UpdateParameters) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 2, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList new_input_descs, // pNewPointerArgList &new_A_desc, // pNewValueArgList @@ -317,10 +321,11 @@ TEST_P(USMMultiSaxpyKernelTest, UpdateWithoutBlocking) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // 
pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 2, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList new_input_descs, // pNewPointerArgList &new_A_desc, // pNewValueArgList diff --git a/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_cuda.match b/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_cuda.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero_v2.match b/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero_v2.match new file mode 100644 index 0000000000..d4645b3ffc --- /dev/null +++ b/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero_v2.match @@ -0,0 +1,5 @@ +{{NONDETERMINISTIC}} +urLevelZeroEnqueueNativeCommandTest.Success{{.*}} +urLevelZeroEnqueueNativeCommandTest.Dependencies{{.*}} +urLevelZeroEnqueueNativeCommandTest.DependenciesURBefore{{.*}} +urLevelZeroEnqueueNativeCommandTest.DependenciesURAfter{{.*}} diff --git a/test/conformance/exp_launch_properties/exp_launch_properties_adapter_cuda.match b/test/conformance/exp_launch_properties/exp_launch_properties_adapter_cuda.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/exp_launch_properties/exp_launch_properties_adapter_hip.match b/test/conformance/exp_launch_properties/exp_launch_properties_adapter_hip.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/exp_launch_properties/exp_launch_properties_adapter_level_zero-v2.match b/test/conformance/exp_launch_properties/exp_launch_properties_adapter_level_zero-v2.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/exp_launch_properties/exp_launch_properties_adapter_level_zero.match b/test/conformance/exp_launch_properties/exp_launch_properties_adapter_level_zero.match deleted file mode 100644 index 
e69de29bb2..0000000000 diff --git a/test/conformance/exp_launch_properties/exp_launch_properties_adapter_native_cpu.match b/test/conformance/exp_launch_properties/exp_launch_properties_adapter_native_cpu.match index 2a87dd8c12..f8b1e49e44 100644 --- a/test/conformance/exp_launch_properties/exp_launch_properties_adapter_native_cpu.match +++ b/test/conformance/exp_launch_properties/exp_launch_properties_adapter_native_cpu.match @@ -1 +1,2 @@ +{{NONDETERMINISTIC}} urEnqueueKernelLaunchCustomTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} diff --git a/test/conformance/exp_launch_properties/exp_launch_properties_adapter_opencl.match b/test/conformance/exp_launch_properties/exp_launch_properties_adapter_opencl.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/exp_usm_p2p/exp_usm_p2p_adapter_cuda.match b/test/conformance/exp_usm_p2p/exp_usm_p2p_adapter_cuda.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/exp_usm_p2p/exp_usm_p2p_adapter_hip.match b/test/conformance/exp_usm_p2p/exp_usm_p2p_adapter_hip.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/exp_usm_p2p/exp_usm_p2p_adapter_level_zero-v2.match b/test/conformance/exp_usm_p2p/exp_usm_p2p_adapter_level_zero-v2.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/exp_usm_p2p/exp_usm_p2p_adapter_level_zero.match b/test/conformance/exp_usm_p2p/exp_usm_p2p_adapter_level_zero.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/exp_usm_p2p/exp_usm_p2p_adapter_native_cpu.match b/test/conformance/exp_usm_p2p/exp_usm_p2p_adapter_native_cpu.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/exp_usm_p2p/exp_usm_p2p_adapter_opencl.match b/test/conformance/exp_usm_p2p/exp_usm_p2p_adapter_opencl.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git 
a/test/conformance/integration/QueueEmptyStatus.cpp b/test/conformance/integration/QueueEmptyStatus.cpp index c7d15602b0..fd49ae7915 100644 --- a/test/conformance/integration/QueueEmptyStatus.cpp +++ b/test/conformance/integration/QueueEmptyStatus.cpp @@ -22,13 +22,25 @@ struct QueueEmptyStatusTestWithParam : uur::IntegrationQueueTestWithParam { GTEST_SKIP() << "Shared USM is not supported."; } + // QUEUE_INFO_EMPTY isn't supported by all adapters + ur_bool_t empty_check = false; + auto result = + urQueueGetInfo(queue, UR_QUEUE_INFO_EMPTY, sizeof(empty_check), + &empty_check, nullptr); + if (result == UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION) { + GTEST_SKIP() << "QUEUE_INFO_EMPTY is not supported."; + } + ASSERT_SUCCESS(result); + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, ArraySize * sizeof(uint32_t), &SharedMem)); } void TearDown() override { - ASSERT_SUCCESS(urUSMFree(context, SharedMem)); + if (SharedMem) { + ASSERT_SUCCESS(urUSMFree(context, SharedMem)); + } uur::IntegrationQueueTestWithParam::TearDown(); } diff --git a/test/conformance/integration/integration_adapter_cuda.match b/test/conformance/integration/integration_adapter_cuda.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/integration/integration_adapter_hip.match b/test/conformance/integration/integration_adapter_hip.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/integration/integration_adapter_level_zero.match b/test/conformance/integration/integration_adapter_level_zero.match index 905fdea60f..a49ad93a94 100644 --- a/test/conformance/integration/integration_adapter_level_zero.match +++ b/test/conformance/integration/integration_adapter_level_zero.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} {{OPT}}QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___IN_ORDER_QUEUE 
{{OPT}}QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___OUT_OF_ORDER_QUEUE {{OPT}}QueueUSMTestWithParam.QueueUSMTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___IN_ORDER_QUEUE diff --git a/test/conformance/integration/integration_adapter_level_zero-v2.match b/test/conformance/integration/integration_adapter_level_zero_v2.match similarity index 97% rename from test/conformance/integration/integration_adapter_level_zero-v2.match rename to test/conformance/integration/integration_adapter_level_zero_v2.match index 905fdea60f..a49ad93a94 100644 --- a/test/conformance/integration/integration_adapter_level_zero-v2.match +++ b/test/conformance/integration/integration_adapter_level_zero_v2.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} {{OPT}}QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___IN_ORDER_QUEUE {{OPT}}QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___OUT_OF_ORDER_QUEUE {{OPT}}QueueUSMTestWithParam.QueueUSMTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___IN_ORDER_QUEUE diff --git a/test/conformance/integration/integration_adapter_native_cpu.match b/test/conformance/integration/integration_adapter_native_cpu.match index d1974de779..b3f1481fa3 100644 --- a/test/conformance/integration/integration_adapter_native_cpu.match +++ b/test/conformance/integration/integration_adapter_native_cpu.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__IN_ORDER_QUEUE QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__OUT_OF_ORDER_QUEUE QueueUSMTestWithParam.QueueUSMTest/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__IN_ORDER_QUEUE diff --git a/test/conformance/integration/integration_adapter_opencl.match 
b/test/conformance/integration/integration_adapter_opencl.match deleted file mode 100644 index 57a5299327..0000000000 --- a/test/conformance/integration/integration_adapter_opencl.match +++ /dev/null @@ -1,6 +0,0 @@ -QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__OpenCL___{{.*}}___IN_ORDER_QUEUE -QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__OpenCL___{{.*}}___OUT_OF_ORDER_QUEUE -QueueUSMTestWithParam.QueueUSMTest/Intel_R__OpenCL___{{.*}}___IN_ORDER_QUEUE -QueueUSMTestWithParam.QueueUSMTest/Intel_R__OpenCL___{{.*}}___OUT_OF_ORDER_QUEUE -QueueBufferTestWithParam.QueueBufferTest/Intel_R__OpenCL___{{.*}}___IN_ORDER_QUEUE -QueueBufferTestWithParam.QueueBufferTest/Intel_R__OpenCL___{{.*}}___OUT_OF_ORDER_QUEUE diff --git a/test/conformance/kernel/kernel_adapter_cuda.match b/test/conformance/kernel/kernel_adapter_cuda.match index fe44a34352..b05b2fda58 100644 --- a/test/conformance/kernel/kernel_adapter_cuda.match +++ b/test/conformance/kernel/kernel_adapter_cuda.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urKernelGetGroupInfoWgSizeTest.CompileWorkGroupSize/NVIDIA_CUDA_BACKEND___{{.*}}_ {{OPT}}urKernelSetArgLocalTest.InvalidKernelArgumentIndex/NVIDIA_CUDA_BACKEND___{{.*}}_ {{OPT}}urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/NVIDIA_CUDA_BACKEND___{{.*}}_ diff --git a/test/conformance/kernel/kernel_adapter_hip.match b/test/conformance/kernel/kernel_adapter_hip.match index 2cfb81f0c6..4e6ab18293 100644 --- a/test/conformance/kernel/kernel_adapter_hip.match +++ b/test/conformance/kernel/kernel_adapter_hip.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urKernelGetGroupInfoWgSizeTest.CompileWorkGroupSize/AMD_HIP_BACKEND___{{.*}}_ urKernelGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_KERNEL_INFO_NUM_REGS urKernelSetArgLocalTest.InvalidKernelArgumentIndex/AMD_HIP_BACKEND___{{.*}}_ diff --git a/test/conformance/kernel/kernel_adapter_level_zero-v2.match b/test/conformance/kernel/kernel_adapter_level_zero-v2.match deleted file mode 100644 index 
77d2096d92..0000000000 --- a/test/conformance/kernel/kernel_adapter_level_zero-v2.match +++ /dev/null @@ -1,17 +0,0 @@ -urKernelGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_REGS -urKernelSetArgLocalTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerTest.SuccessShared/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgSamplerTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgValueTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgValueTest.InvalidKernelArgumentSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetExecInfoTest.SuccessIndirectAccess/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetExecInfoUSMPointersTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetExecInfoUSMPointersTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetExecInfoUSMPointersTest.SuccessShared/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelGetSuggestedLocalWorkSizeTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelGetSuggestedLocalWorkSizeTest.Success2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelGetSuggestedLocalWorkSizeTest.Success3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git 
a/test/conformance/kernel/kernel_adapter_level_zero.match b/test/conformance/kernel/kernel_adapter_level_zero.match index 7b74766ac2..cf83e73ff3 100644 --- a/test/conformance/kernel/kernel_adapter_level_zero.match +++ b/test/conformance/kernel/kernel_adapter_level_zero.match @@ -1,10 +1,4 @@ -urKernelGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_REGS -urKernelSetArgLocalTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgSamplerTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgValueTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgValueTest.InvalidKernelArgumentSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{NONDETERMINISTIC}} urKernelSetExecInfoTest.SuccessIndirectAccess/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoUSMPointersTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetExecInfoUSMPointersTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/kernel/kernel_adapter_level_zero_v2.match b/test/conformance/kernel/kernel_adapter_level_zero_v2.match new file mode 100644 index 0000000000..8f431b3617 --- /dev/null +++ b/test/conformance/kernel/kernel_adapter_level_zero_v2.match @@ -0,0 +1,64 @@ +{{NONDETERMINISTIC}} +urKernelGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_FUNCTION_NAME +urKernelGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_ARGS 
+urKernelGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_REFERENCE_COUNT +urKernelGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_CONTEXT +urKernelGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_PROGRAM +urKernelGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_ATTRIBUTES +urKernelGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_REGS +urKernelGetInfoTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_FUNCTION_NAME +urKernelGetInfoTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_ARGS +urKernelGetInfoTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_REFERENCE_COUNT +urKernelGetInfoTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_CONTEXT +urKernelGetInfoTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_PROGRAM +urKernelGetInfoTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_ATTRIBUTES +urKernelGetInfoTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_REGS +urKernelGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_FUNCTION_NAME +urKernelGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_ARGS +urKernelGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_REFERENCE_COUNT +urKernelGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_CONTEXT 
+urKernelGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_PROGRAM +urKernelGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_ATTRIBUTES +urKernelGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_REGS +urKernelGetInfoTest.InvalidNullPointerPropValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_FUNCTION_NAME +urKernelGetInfoTest.InvalidNullPointerPropValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_ARGS +urKernelGetInfoTest.InvalidNullPointerPropValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_REFERENCE_COUNT +urKernelGetInfoTest.InvalidNullPointerPropValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_CONTEXT +urKernelGetInfoTest.InvalidNullPointerPropValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_PROGRAM +urKernelGetInfoTest.InvalidNullPointerPropValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_ATTRIBUTES +urKernelGetInfoTest.InvalidNullPointerPropValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_REGS +urKernelGetInfoSingleTest.KernelNameCorrect/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urKernelGetInfoSingleTest.KernelContextCorrect/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urKernelSetArgLocalTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
+urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___NORMALIZED_UR_SAMPLER_ADDRESSING_MODE_NONE_UR_SAMPLER_FILTER_MODE_NEAREST +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___NORMALIZED_UR_SAMPLER_ADDRESSING_MODE_NONE_UR_SAMPLER_FILTER_MODE_LINEAR +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___NORMALIZED_UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE_UR_SAMPLER_FILTER_MODE_NEAREST +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___NORMALIZED_UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE_UR_SAMPLER_FILTER_MODE_LINEAR +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___NORMALIZED_UR_SAMPLER_ADDRESSING_MODE_CLAMP_UR_SAMPLER_FILTER_MODE_NEAREST +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___NORMALIZED_UR_SAMPLER_ADDRESSING_MODE_CLAMP_UR_SAMPLER_FILTER_MODE_LINEAR +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___NORMALIZED_UR_SAMPLER_ADDRESSING_MODE_REPEAT_UR_SAMPLER_FILTER_MODE_NEAREST +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___NORMALIZED_UR_SAMPLER_ADDRESSING_MODE_REPEAT_UR_SAMPLER_FILTER_MODE_LINEAR +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___NORMALIZED_UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT_UR_SAMPLER_FILTER_MODE_NEAREST +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___NORMALIZED_UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT_UR_SAMPLER_FILTER_MODE_LINEAR +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UNNORMALIZED_UR_SAMPLER_ADDRESSING_MODE_NONE_UR_SAMPLER_FILTER_MODE_NEAREST 
+urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UNNORMALIZED_UR_SAMPLER_ADDRESSING_MODE_NONE_UR_SAMPLER_FILTER_MODE_LINEAR +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UNNORMALIZED_UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE_UR_SAMPLER_FILTER_MODE_NEAREST +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UNNORMALIZED_UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE_UR_SAMPLER_FILTER_MODE_LINEAR +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UNNORMALIZED_UR_SAMPLER_ADDRESSING_MODE_CLAMP_UR_SAMPLER_FILTER_MODE_NEAREST +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UNNORMALIZED_UR_SAMPLER_ADDRESSING_MODE_CLAMP_UR_SAMPLER_FILTER_MODE_LINEAR +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UNNORMALIZED_UR_SAMPLER_ADDRESSING_MODE_REPEAT_UR_SAMPLER_FILTER_MODE_NEAREST +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UNNORMALIZED_UR_SAMPLER_ADDRESSING_MODE_REPEAT_UR_SAMPLER_FILTER_MODE_LINEAR +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UNNORMALIZED_UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT_UR_SAMPLER_FILTER_MODE_NEAREST +urKernelSetArgSamplerTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UNNORMALIZED_UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT_UR_SAMPLER_FILTER_MODE_LINEAR +urKernelSetArgSamplerTest.SuccessWithProps/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urKernelSetArgSamplerTest.InvalidNullHandleKernel/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urKernelSetArgSamplerTest.InvalidNullHandleArgValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
+urKernelSetArgSamplerTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urKernelSetArgValueTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urKernelSetArgValueTest.InvalidKernelArgumentSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urKernelSetExecInfoTest.SuccessIndirectAccess/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urKernelSetExecInfoUSMPointersTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urKernelSetExecInfoUSMPointersTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urKernelSetExecInfoUSMPointersTest.SuccessShared/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/kernel/kernel_adapter_native_cpu.match b/test/conformance/kernel/kernel_adapter_native_cpu.match index 4d3b506fcf..368f4ad358 100644 --- a/test/conformance/kernel/kernel_adapter_native_cpu.match +++ b/test/conformance/kernel/kernel_adapter_native_cpu.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urKernelCreateTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urKernelCreateTest.InvalidNullHandleProgram/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urKernelCreateTest.InvalidNullPointerName/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} @@ -12,25 +13,34 @@ urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_K urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE +urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE 
+urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE +urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE +urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 
urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE +urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE +urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE +urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE +urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE urKernelGetGroupInfoSingleTest.CompileWorkGroupSizeEmpty/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +urKernelGetGroupInfoSingleTest.CompileMaxWorkGroupSizeEmpty/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urKernelGetGroupInfoWgSizeTest.CompileWorkGroupSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urKernelGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_INFO_FUNCTION_NAME urKernelGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_INFO_NUM_ARGS @@ -104,6 +114,7 @@ 
urKernelGetSubGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU urKernelGetSubGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL urKernelGetSubGroupInfoSingleTest.CompileNumSubgroupsIsZero/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urKernelReleaseTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +urKernelReleaseTest.KernelReleaseAfterProgramRelease/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urKernelReleaseTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urKernelRetainTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urKernelRetainTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} @@ -161,6 +172,10 @@ urKernelSetSpecializationConstantsTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU urKernelSetSpecializationConstantsTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urKernelSetSpecializationConstantsTest.InvalidNullPointerSpecConstants/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urKernelSetSpecializationConstantsTest.InvalidSizeCount/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +urKernelSetSpecializationConstantsTest.InvalidValueSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +urKernelSetSpecializationConstantsTest.InvalidValueId/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +urKernelSetSpecializationConstantsTest.InvalidValuePtr/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +urKernelSetSpecializationConstantsNegativeTest.Unsupported/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urKernelGetSuggestedLocalWorkSizeTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urKernelGetSuggestedLocalWorkSizeTest.Success2D/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urKernelGetSuggestedLocalWorkSizeTest.Success3D/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} diff --git a/test/conformance/kernel/kernel_adapter_opencl.match b/test/conformance/kernel/kernel_adapter_opencl.match index dfc23cf5ee..d65c8e51c8 100644 --- 
a/test/conformance/kernel/kernel_adapter_opencl.match +++ b/test/conformance/kernel/kernel_adapter_opencl.match @@ -1 +1,2 @@ +{{NONDETERMINISTIC}} urKernelGetInfoTest.Success/Intel_R__OpenCL_{{.*}}_UR_KERNEL_INFO_NUM_REGS diff --git a/test/conformance/kernel/urKernelGetGroupInfo.cpp b/test/conformance/kernel/urKernelGetGroupInfo.cpp index b91001a07f..2b3c70c22e 100644 --- a/test/conformance/kernel/urKernelGetGroupInfo.cpp +++ b/test/conformance/kernel/urKernelGetGroupInfo.cpp @@ -3,6 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include #include using urKernelGetGroupInfoTest = @@ -15,7 +16,9 @@ UUR_TEST_SUITE_P( UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE, UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, - UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE), + UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE, + UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE, + UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE), uur::deviceTestWithParamPrinter); struct urKernelGetGroupInfoSingleTest : uur::urKernelTest { @@ -85,7 +88,7 @@ TEST_P(urKernelGetGroupInfoWgSizeTest, CompileWorkGroupSize) { } TEST_P(urKernelGetGroupInfoSingleTest, CompileWorkGroupSizeEmpty) { - // Returns 0 by default when there is no sepecific information + // Returns 0 by default when there is no specific information std::array read_dims{1, 1, 1}; std::array zero{0, 0, 0}; ASSERT_SUCCESS(urKernelGetGroupInfo( @@ -93,3 +96,17 @@ TEST_P(urKernelGetGroupInfoSingleTest, CompileWorkGroupSizeEmpty) { sizeof(read_dims), read_dims.data(), nullptr)); ASSERT_EQ(read_dims, zero); } + +TEST_P(urKernelGetGroupInfoSingleTest, CompileMaxWorkGroupSizeEmpty) { + // Returns 0 by default when there is no specific information + std::array read_dims{1, 1, 1}; + std::array zero{0, 0, 0}; + auto result = urKernelGetGroupInfo( + kernel, device, UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE, + sizeof(read_dims), read_dims.data(), nullptr); + if (result 
== UR_RESULT_SUCCESS) { + ASSERT_EQ(read_dims, zero); + } else { + ASSERT_EQ(result, UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); + } +} diff --git a/test/conformance/kernel/urKernelRelease.cpp b/test/conformance/kernel/urKernelRelease.cpp index 051c9d4954..3e2078d98c 100644 --- a/test/conformance/kernel/urKernelRelease.cpp +++ b/test/conformance/kernel/urKernelRelease.cpp @@ -13,6 +13,13 @@ TEST_P(urKernelReleaseTest, Success) { ASSERT_SUCCESS(urKernelRelease(kernel)); } +TEST_P(urKernelReleaseTest, KernelReleaseAfterProgramRelease) { + ASSERT_SUCCESS(urKernelRetain(kernel)); + ASSERT_SUCCESS(urProgramRelease(program)); + program = nullptr; + ASSERT_SUCCESS(urKernelRelease(kernel)); +} + TEST_P(urKernelReleaseTest, InvalidNullHandleKernel) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, urKernelRelease(nullptr)); diff --git a/test/conformance/kernel/urKernelSetSpecializationConstants.cpp b/test/conformance/kernel/urKernelSetSpecializationConstants.cpp index 665a20de4a..e12df68db0 100644 --- a/test/conformance/kernel/urKernelSetSpecializationConstants.cpp +++ b/test/conformance/kernel/urKernelSetSpecializationConstants.cpp @@ -27,6 +27,28 @@ struct urKernelSetSpecializationConstantsTest : uur::urBaseKernelExecutionTest { }; UUR_INSTANTIATE_KERNEL_TEST_SUITE_P(urKernelSetSpecializationConstantsTest); +struct urKernelSetSpecializationConstantsNegativeTest + : uur::urBaseKernelExecutionTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelExecutionTest::SetUp()); + bool supports_kernel_spec_constant = false; + ASSERT_SUCCESS(urDeviceGetInfo( + device, UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS, + sizeof(supports_kernel_spec_constant), + &supports_kernel_spec_constant, nullptr)); + if (supports_kernel_spec_constant) { + GTEST_SKIP() << "Device supports setting kernel spec constants."; + } + Build(); + } + + uint32_t spec_value = 42; + ur_specialization_constant_info_t info = {0, sizeof(spec_value), + &spec_value}; +}; 
+UUR_INSTANTIATE_KERNEL_TEST_SUITE_P( + urKernelSetSpecializationConstantsNegativeTest); + TEST_P(urKernelSetSpecializationConstantsTest, Success) { ASSERT_SUCCESS(urKernelSetSpecializationConstants(kernel, 1, &info)); @@ -36,6 +58,11 @@ TEST_P(urKernelSetSpecializationConstantsTest, Success) { ValidateBuffer(buffer, sizeof(spec_value), spec_value); } +TEST_P(urKernelSetSpecializationConstantsNegativeTest, Unsupported) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_UNSUPPORTED_FEATURE, + urKernelSetSpecializationConstants(kernel, 1, &info)); +} + TEST_P(urKernelSetSpecializationConstantsTest, InvalidNullHandleKernel) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, urKernelSetSpecializationConstants(nullptr, 1, &info)); @@ -51,3 +78,23 @@ TEST_P(urKernelSetSpecializationConstantsTest, InvalidSizeCount) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_SIZE, urKernelSetSpecializationConstants(kernel, 0, &info)); } + +TEST_P(urKernelSetSpecializationConstantsTest, InvalidValueSize) { + ur_specialization_constant_info_t bad_info = {0, 0x1000, &spec_value}; + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_VALUE, + urKernelSetSpecializationConstants(kernel, 1, &bad_info)); +} + +TEST_P(urKernelSetSpecializationConstantsTest, InvalidValueId) { + ur_specialization_constant_info_t bad_info = {999, sizeof(spec_value), + &spec_value}; + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_SPEC_ID, + urKernelSetSpecializationConstants(kernel, 1, &bad_info)); +} + +TEST_P(urKernelSetSpecializationConstantsTest, InvalidValuePtr) { + ur_specialization_constant_info_t bad_info = {0, sizeof(spec_value), + nullptr}; + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_VALUE, + urKernelSetSpecializationConstants(kernel, 1, &bad_info)); +} diff --git a/test/conformance/memory-migrate/memory-migrate_adapter_cuda.match b/test/conformance/memory-migrate/memory-migrate_adapter_cuda.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/memory-migrate/memory-migrate_adapter_hip.match 
b/test/conformance/memory-migrate/memory-migrate_adapter_hip.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/memory-migrate/memory-migrate_adapter_level_zero-v2.match b/test/conformance/memory-migrate/memory-migrate_adapter_level_zero-v2.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/memory-migrate/memory-migrate_adapter_level_zero.match b/test/conformance/memory-migrate/memory-migrate_adapter_level_zero.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/memory-migrate/memory-migrate_adapter_native_cpu.match b/test/conformance/memory-migrate/memory-migrate_adapter_native_cpu.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/memory-migrate/memory-migrate_adapter_opencl.match b/test/conformance/memory-migrate/memory-migrate_adapter_opencl.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp b/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp index 2e8856ac97..f7617a2940 100644 --- a/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp +++ b/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp @@ -165,6 +165,9 @@ TEST_F(urMultiDeviceContextMemBufferTest, WriteRead) { ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, false, 0, buffer_size_bytes, out_vec.data(), 1, &e1, nullptr)); + + ASSERT_SUCCESS(urQueueFinish(queues[1])); + for (auto &a : out_vec) { ASSERT_EQ(a, fill_val); } @@ -186,6 +189,9 @@ TEST_F(urMultiDeviceContextMemBufferTest, FillRead) { ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, false, 0, buffer_size_bytes, out_vec.data(), 1, &e1, nullptr)); + + ASSERT_SUCCESS(urQueueFinish(queues[1])); + for (auto &a : out_vec) { ASSERT_EQ(a, fill_val); } @@ -219,6 +225,9 @@ TEST_F(urMultiDeviceContextMemBufferTest, WriteKernelRead) { 
ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[0], buffer, false, 0, buffer_size_bytes, out_vec.data(), 1, &e2, nullptr)); + + ASSERT_SUCCESS(urQueueFinish(queues[0])); + for (auto &a : out_vec) { ASSERT_EQ(a, fill_val + 1); } @@ -257,6 +266,9 @@ TEST_F(urMultiDeviceContextMemBufferTest, WriteKernelKernelRead) { ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, false, 0, buffer_size_bytes, out_vec.data(), 1, &e3, nullptr)); + + ASSERT_SUCCESS(urQueueFinish(queues[1])); + for (auto &a : out_vec) { ASSERT_EQ(a, fill_val + 2); } diff --git a/test/conformance/memory/memory_adapter_cuda.match b/test/conformance/memory/memory_adapter_cuda.match index 7d2e6a1c01..c5b70e8559 100644 --- a/test/conformance/memory/memory_adapter_cuda.match +++ b/test/conformance/memory/memory_adapter_cuda.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urMemImageCreateTest.InvalidSize/NVIDIA_CUDA_BACKEND___{{.*}}_ {{OPT}}urMemImageCremBufferCrateTestWith1DMemoryTypeParam.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_MEM_TYPE_IMAGE1D_ARRAY {{OPT}}urMemImageCreateTestWith2DMemoryTypeParam.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_MEM_TYPE_IMAGE2D_ARRAY diff --git a/test/conformance/memory/memory_adapter_hip.match b/test/conformance/memory/memory_adapter_hip.match index a4e1b131f0..589542df7f 100644 --- a/test/conformance/memory/memory_adapter_hip.match +++ b/test/conformance/memory/memory_adapter_hip.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urMemImageCreateTest.InvalidSize/AMD_HIP_BACKEND___{{.*}} urMemImageGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}} urMemImageGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}} diff --git a/test/conformance/memory/memory_adapter_level_zero-v2.match b/test/conformance/memory/memory_adapter_level_zero-v2.match deleted file mode 100644 index 369bc5e727..0000000000 --- a/test/conformance/memory/memory_adapter_level_zero-v2.match +++ /dev/null @@ -1,4 +0,0 @@ -urMemBufferPartitionTest.InvalidValueCreateType/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
-urMemBufferPartitionTest.InvalidValueBufferCreateInfoOutOfBounds/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -{{OPT}}urMemGetInfoImageTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_MEM_INFO_SIZE -{{Segmentation fault|Aborted}} diff --git a/test/conformance/memory/memory_adapter_level_zero.match b/test/conformance/memory/memory_adapter_level_zero.match index 369bc5e727..b2b72be06b 100644 --- a/test/conformance/memory/memory_adapter_level_zero.match +++ b/test/conformance/memory/memory_adapter_level_zero.match @@ -1,4 +1,17 @@ +{{NONDETERMINISTIC}} urMemBufferPartitionTest.InvalidValueCreateType/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urMemBufferPartitionTest.InvalidValueBufferCreateInfoOutOfBounds/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ {{OPT}}urMemGetInfoImageTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_MEM_INFO_SIZE -{{Segmentation fault|Aborted}} +{{OPT}}{{Segmentation fault|Aborted}} +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_FLOAT \ No newline at end of file diff --git a/test/conformance/memory/memory_adapter_level_zero_v2.match b/test/conformance/memory/memory_adapter_level_zero_v2.match new file mode 100644 index 0000000000..284dcab4b0 --- /dev/null +++ b/test/conformance/memory/memory_adapter_level_zero_v2.match @@ -0,0 +1,280 @@ +{{NONDETERMINISTIC}} +urMemBufferPartitionTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urMemBufferPartitionTest.InvalidValueCreateType/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urMemBufferPartitionTest.InvalidValueBufferCreateInfoOutOfBounds/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
+{{OPT}}urMemGetInfoImageTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_MEM_INFO_SIZE +{{OPT}}urMemGetInfoImageTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_MEM_INFO_CONTEXT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_INT_101010 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_INT_101010 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_INT_101010 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_INT_101010 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_INT_101010 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_INT_101010 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_INT_101010 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_INT_101010 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_FLOAT 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_INT_101010 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_INT_101010 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_INT_101010 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_INT_101010 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_INT_101010 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_INT_101010 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 
+{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_INT_101010 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +{{OPT}}urMemImageCreateTestWithImageFormatParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_FLOAT +{{OPT}}urMemImageGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_FORMAT +{{OPT}}urMemImageGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_ELEMENT_SIZE +{{OPT}}urMemImageGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_ROW_PITCH 
+{{OPT}}urMemImageGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_SLICE_PITCH +{{OPT}}urMemImageGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_WIDTH +{{OPT}}urMemImageGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_HEIGHT +{{OPT}}urMemImageGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_DEPTH +{{OPT}}urMemImageGetInfoTest.InvalidNullHandleImage/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_FORMAT +{{OPT}}urMemImageGetInfoTest.InvalidNullHandleImage/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_ELEMENT_SIZE +{{OPT}}urMemImageGetInfoTest.InvalidNullHandleImage/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_ROW_PITCH +{{OPT}}urMemImageGetInfoTest.InvalidNullHandleImage/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_SLICE_PITCH +{{OPT}}urMemImageGetInfoTest.InvalidNullHandleImage/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_WIDTH +{{OPT}}urMemImageGetInfoTest.InvalidNullHandleImage/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_HEIGHT +{{OPT}}urMemImageGetInfoTest.InvalidNullHandleImage/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_DEPTH +{{OPT}}urMemImageGetInfoTest.InvalidEnumerationImageInfoType/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_FORMAT +{{OPT}}urMemImageGetInfoTest.InvalidEnumerationImageInfoType/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_ELEMENT_SIZE +{{OPT}}urMemImageGetInfoTest.InvalidEnumerationImageInfoType/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_ROW_PITCH +{{OPT}}urMemImageGetInfoTest.InvalidEnumerationImageInfoType/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_SLICE_PITCH 
+{{OPT}}urMemImageGetInfoTest.InvalidEnumerationImageInfoType/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_WIDTH +{{OPT}}urMemImageGetInfoTest.InvalidEnumerationImageInfoType/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_HEIGHT +{{OPT}}urMemImageGetInfoTest.InvalidEnumerationImageInfoType/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_DEPTH +{{OPT}}urMemImageGetInfoTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_FORMAT +{{OPT}}urMemImageGetInfoTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_ELEMENT_SIZE +{{OPT}}urMemImageGetInfoTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_ROW_PITCH +{{OPT}}urMemImageGetInfoTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_SLICE_PITCH +{{OPT}}urMemImageGetInfoTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_WIDTH +{{OPT}}urMemImageGetInfoTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_HEIGHT +{{OPT}}urMemImageGetInfoTest.InvalidSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_DEPTH +{{OPT}}urMemImageGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_FORMAT +{{OPT}}urMemImageGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_ELEMENT_SIZE +{{OPT}}urMemImageGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_ROW_PITCH +{{OPT}}urMemImageGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_SLICE_PITCH +{{OPT}}urMemImageGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_WIDTH 
+{{OPT}}urMemImageGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_HEIGHT +{{OPT}}urMemImageGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_DEPTH +{{OPT}}urMemImageGetInfoTest.InvalidNullPointerParamValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_FORMAT +{{OPT}}urMemImageGetInfoTest.InvalidNullPointerParamValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_ELEMENT_SIZE +{{OPT}}urMemImageGetInfoTest.InvalidNullPointerParamValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_ROW_PITCH +{{OPT}}urMemImageGetInfoTest.InvalidNullPointerParamValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_SLICE_PITCH +{{OPT}}urMemImageGetInfoTest.InvalidNullPointerParamValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_WIDTH +{{OPT}}urMemImageGetInfoTest.InvalidNullPointerParamValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_HEIGHT +{{OPT}}urMemImageGetInfoTest.InvalidNullPointerParamValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_DEPTH +{{OPT}}urMemImageGetInfoTest.InvalidNullPointerPropSizeRet/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_FORMAT +{{OPT}}urMemImageGetInfoTest.InvalidNullPointerPropSizeRet/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_ELEMENT_SIZE +{{OPT}}urMemImageGetInfoTest.InvalidNullPointerPropSizeRet/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_ROW_PITCH +{{OPT}}urMemImageGetInfoTest.InvalidNullPointerPropSizeRet/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_SLICE_PITCH +{{OPT}}urMemImageGetInfoTest.InvalidNullPointerPropSizeRet/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_WIDTH 
+{{OPT}}urMemImageGetInfoTest.InvalidNullPointerPropSizeRet/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_HEIGHT +{{OPT}}urMemImageGetInfoTest.InvalidNullPointerPropSizeRet/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_IMAGE_INFO_DEPTH diff --git a/test/conformance/memory/memory_adapter_native_cpu.match b/test/conformance/memory/memory_adapter_native_cpu.match index 2fb0814324..5bdd88804b 100644 --- a/test/conformance/memory/memory_adapter_native_cpu.match +++ b/test/conformance/memory/memory_adapter_native_cpu.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urMemBufferPartitionTest.InvalidValueCreateType/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urMemBufferPartitionTest.InvalidValueBufferCreateInfoOutOfBounds/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urMemGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_MEM_INFO_SIZE diff --git a/test/conformance/memory/memory_adapter_opencl.match b/test/conformance/memory/memory_adapter_opencl.match index 23dfbbae8c..b57e3876d0 100644 --- a/test/conformance/memory/memory_adapter_opencl.match +++ b/test/conformance/memory/memory_adapter_opencl.match @@ -1 +1,2 @@ +{{NONDETERMINISTIC}} urMemImageCreateTest.InvalidImageDescStype/Intel_R__OpenCL___{{.*}} diff --git a/test/conformance/memory/urMemImageCreate.cpp b/test/conformance/memory/urMemImageCreate.cpp index ea210a921f..28d5d9c4e3 100644 --- a/test/conformance/memory/urMemImageCreate.cpp +++ b/test/conformance/memory/urMemImageCreate.cpp @@ -26,10 +26,10 @@ struct urMemImageCreateTest : public uur::urContextTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(uur::urContextTest::SetUp()); - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; auto ret = urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, - &image_desc, nullptr, &image_handle); + &image_desc, nullptr, image_handle.ptr()); if (ret == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { GTEST_SKIP() << "urMemImageCreate not 
supported"; @@ -50,10 +50,10 @@ struct urMemImageCreateTestWithParam UUR_RETURN_ON_FATAL_FAILURE( uur::urContextTestWithParam::SetUp()); - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; auto ret = urMemImageCreate(this->context, UR_MEM_FLAG_READ_WRITE, &image_format, &image_desc, nullptr, - &image_handle); + image_handle.ptr()); if (ret == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { GTEST_SKIP() << "urMemImageCreate not supported"; @@ -89,12 +89,11 @@ TEST_P(urMemImageCreateTestWith1DMemoryTypeParam, Success) { 0 ///< [in] number of samples }; - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &image_desc_with_param, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); ASSERT_NE(nullptr, image_handle); - ASSERT_SUCCESS(urMemRelease(image_handle)); } using urMemImageCreateTestWith2DMemoryTypeParam = @@ -120,12 +119,11 @@ TEST_P(urMemImageCreateTestWith2DMemoryTypeParam, Success) { 0 ///< [in] number of samples }; - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &image_desc_with_param, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); ASSERT_NE(nullptr, image_handle); - ASSERT_SUCCESS(urMemRelease(image_handle)); } TEST_P(urMemImageCreateTest, SuccessWith3DImageType) { @@ -143,28 +141,27 @@ TEST_P(urMemImageCreateTest, SuccessWith3DImageType) { 0 ///< [in] number of samples }; - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &image_desc_with_param, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); ASSERT_NE(nullptr, image_handle); - ASSERT_SUCCESS(urMemRelease(image_handle)); } TEST_P(urMemImageCreateTest, InvalidNullHandleContext) { - ur_mem_handle_t image_handle = nullptr; 
+ uur::raii::Mem image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, urMemImageCreate(nullptr, UR_MEM_FLAG_READ_WRITE, &image_format, &image_desc, nullptr, - &image_handle)); + image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidEnumerationFlags) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_ENUMERATION, urMemImageCreate(context, UR_MEM_FLAG_FORCE_UINT32, &image_format, &image_desc, nullptr, - &image_handle)); + image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidNullPointerBuffer) { @@ -175,23 +172,24 @@ TEST_P(urMemImageCreateTest, InvalidNullPointerBuffer) { } TEST_P(urMemImageCreateTest, InvalidNullPointerImageDesc) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, nullptr, nullptr, - &image_handle)); + image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidNullPointerImageFormat) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, nullptr, - &image_desc, nullptr, &image_handle)); + &image_desc, nullptr, + image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidSize) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.width = std::numeric_limits::max(); @@ -199,7 +197,7 @@ TEST_P(urMemImageCreateTest, InvalidSize) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_SIZE, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); invalid_image_desc = image_desc; invalid_image_desc.height = std::numeric_limits::max(); @@ -207,7 +205,7 @@ 
TEST_P(urMemImageCreateTest, InvalidSize) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_SIZE, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); invalid_image_desc = image_desc; invalid_image_desc.depth = std::numeric_limits::max(); @@ -215,21 +213,21 @@ TEST_P(urMemImageCreateTest, InvalidSize) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_SIZE, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescStype) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.stype = UR_STRUCTURE_TYPE_FORCE_UINT32; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescType) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.type = UR_MEM_TYPE_FORCE_UINT32; @@ -237,11 +235,11 @@ TEST_P(urMemImageCreateTest, InvalidImageDescType) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescNumMipLevel) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.numMipLevel = 1; /* Must be 0 */ @@ -249,11 +247,11 @@ TEST_P(urMemImageCreateTest, InvalidImageDescNumMipLevel) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, 
urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescNumSamples) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.numSamples = 1; /* Must be 0 */ @@ -261,11 +259,11 @@ TEST_P(urMemImageCreateTest, InvalidImageDescNumSamples) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescRowPitch) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.rowPitch = 1; /* Must be 0 if pHost is NULL */ @@ -273,11 +271,11 @@ TEST_P(urMemImageCreateTest, InvalidImageDescRowPitch) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } TEST_P(urMemImageCreateTest, InvalidImageDescSlicePitch) { - ur_mem_handle_t image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ur_image_desc_t invalid_image_desc = image_desc; invalid_image_desc.slicePitch = 1; /* Must be 0 if pHost is NULL */ @@ -285,7 +283,7 @@ TEST_P(urMemImageCreateTest, InvalidImageDescSlicePitch) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR, urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, &invalid_image_desc, - nullptr, &image_handle)); + nullptr, image_handle.ptr())); } using urMemImageCreateWithHostPtrFlagsTest = @@ -310,8 +308,9 @@ TEST_P(urMemImageCreateWithHostPtrFlagsTest, Success) { } TEST_P(urMemImageCreateWithHostPtrFlagsTest, InvalidHostPtr) { - ur_mem_handle_t 
image_handle = nullptr; + uur::raii::Mem image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_HOST_PTR, urMemImageCreate(context, getParam(), &image_format, - &image_desc, nullptr, &image_handle)); + &image_desc, nullptr, + image_handle.ptr())); } diff --git a/test/conformance/platform/platform_adapter_cuda.match b/test/conformance/platform/platform_adapter_cuda.match index b459b89bbe..7806019709 100644 --- a/test/conformance/platform/platform_adapter_cuda.match +++ b/test/conformance/platform/platform_adapter_cuda.match @@ -1 +1,2 @@ +{{NONDETERMINISTIC}} urPlatformCreateWithNativeHandleTest.InvalidNullPointerPlatform diff --git a/test/conformance/platform/platform_adapter_hip.match b/test/conformance/platform/platform_adapter_hip.match index b459b89bbe..7806019709 100644 --- a/test/conformance/platform/platform_adapter_hip.match +++ b/test/conformance/platform/platform_adapter_hip.match @@ -1 +1,2 @@ +{{NONDETERMINISTIC}} urPlatformCreateWithNativeHandleTest.InvalidNullPointerPlatform diff --git a/test/conformance/platform/platform_adapter_level_zero-v2.match b/test/conformance/platform/platform_adapter_level_zero-v2.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/platform/platform_adapter_level_zero.match b/test/conformance/platform/platform_adapter_level_zero.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/platform/platform_adapter_native_cpu.match b/test/conformance/platform/platform_adapter_native_cpu.match index 257822d30b..7806019709 100644 --- a/test/conformance/platform/platform_adapter_native_cpu.match +++ b/test/conformance/platform/platform_adapter_native_cpu.match @@ -1,6 +1,2 @@ +{{NONDETERMINISTIC}} urPlatformCreateWithNativeHandleTest.InvalidNullPointerPlatform -urPlatfromGetBackendOptionTest.InvalidValueFrontendOption -urPlatfromGetBackendOptionTestWithParam.Success/_O0 -urPlatfromGetBackendOptionTestWithParam.Success/_O1 
-urPlatfromGetBackendOptionTestWithParam.Success/_O2 -urPlatfromGetBackendOptionTestWithParam.Success/_O3 diff --git a/test/conformance/platform/platform_adapter_opencl.match b/test/conformance/platform/platform_adapter_opencl.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/program/program_adapter_cuda.match b/test/conformance/program/program_adapter_cuda.match index 5ffc32bb03..fac749462e 100644 --- a/test/conformance/program/program_adapter_cuda.match +++ b/test/conformance/program/program_adapter_cuda.match @@ -1,13 +1,14 @@ +{{NONDETERMINISTIC}} urProgramBuildTest.BuildFailure/NVIDIA_CUDA_BACKEND___{{.*}}_ {{OPT}}urProgramCreateWithILTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} {{OPT}}urProgramCreateWithILTest.SuccessWithProperties/NVIDIA_CUDA_BACKEND___{{.*}} {{OPT}}urProgramCreateWithILTest.BuildInvalidProgram/NVIDIA_CUDA_BACKEND___{{.*}} # This test flakily fails {{OPT}}urProgramGetBuildInfoSingleTest.LogIsNullTerminated/NVIDIA_CUDA_BACKEND___{{.*}} -# CUDA doesn't expose kernel numbers or names -urProgramGetInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_NUM_KERNELS -urProgramGetInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_KERNEL_NAMES {{OPT}}urProgramSetSpecializationConstantsTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} {{OPT}}urProgramSetSpecializationConstantsTest.UseDefaultValue/NVIDIA_CUDA_BACKEND___{{.*}} -urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/NVIDIA_CUDA_BACKEND___{{.*}} -urProgramSetMultipleSpecializationConstantsTest.SingleCall/NVIDIA_CUDA_BACKEND___{{.*}} +urProgramSetSpecializationConstantsTest.InvalidValueSize/NVIDIA_CUDA_BACKEND___{{.*}}_ +urProgramSetSpecializationConstantsTest.InvalidValueId/NVIDIA_CUDA_BACKEND___{{.*}}_ +urProgramSetSpecializationConstantsTest.InvalidValuePtr/NVIDIA_CUDA_BACKEND___{{.*}}_ +urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/NVIDIA_CUDA_BACKEND___{{.*}}_ 
+urProgramSetMultipleSpecializationConstantsTest.SingleCall/NVIDIA_CUDA_BACKEND___{{.*}}_ diff --git a/test/conformance/program/program_adapter_hip.match b/test/conformance/program/program_adapter_hip.match index 183d88342d..2f93f09660 100644 --- a/test/conformance/program/program_adapter_hip.match +++ b/test/conformance/program/program_adapter_hip.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urProgramBuildTest.BuildFailure/AMD_HIP_BACKEND___{{.*}}_ # HIP hasn't implemented urProgramCreateWithNativeHandleTest {{OPT}}urProgramCreateWithNativeHandleTest.Success/AMD_HIP_BACKEND___{{.*}}_ @@ -8,7 +9,11 @@ urProgramGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_KERNEL_N # HIP hasn't implemented urProgramLink {{OPT}}urProgramLinkTest.Success/AMD_HIP_BACKEND___{{.*}}_ +# Hip doesn't support specialization constants urProgramSetSpecializationConstantsTest.Success/AMD_HIP_BACKEND___{{.*}}_ urProgramSetSpecializationConstantsTest.UseDefaultValue/AMD_HIP_BACKEND___{{.*}}_ +urProgramSetSpecializationConstantsTest.InvalidValueSize/AMD_HIP_BACKEND___{{.*}}_ +urProgramSetSpecializationConstantsTest.InvalidValueId/AMD_HIP_BACKEND___{{.*}}_ +urProgramSetSpecializationConstantsTest.InvalidValuePtr/AMD_HIP_BACKEND___{{.*}}_ urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/AMD_HIP_BACKEND___{{.*}}_ urProgramSetMultipleSpecializationConstantsTest.SingleCall/AMD_HIP_BACKEND___{{.*}}_ diff --git a/test/conformance/program/program_adapter_level_zero-v2.match b/test/conformance/program/program_adapter_level_zero-v2.match deleted file mode 100644 index 05b71211b8..0000000000 --- a/test/conformance/program/program_adapter_level_zero-v2.match +++ /dev/null @@ -1,6 +0,0 @@ -urProgramCreateWithNativeHandleTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urProgramCreateWithNativeHandleTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
-urProgramCreateWithNativeHandleTest.InvalidNullPointerProgram/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urProgramGetBuildInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROGRAM_BUILD_INFO_STATUS -urProgramGetFunctionPointerTest.InvalidKernelName/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -Aborted diff --git a/test/conformance/program/program_adapter_level_zero.match b/test/conformance/program/program_adapter_level_zero.match index 9e902dca94..445f7e6fbd 100644 --- a/test/conformance/program/program_adapter_level_zero.match +++ b/test/conformance/program/program_adapter_level_zero.match @@ -1,6 +1,12 @@ +{{NONDETERMINISTIC}} urProgramCreateWithNativeHandleTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urProgramCreateWithNativeHandleTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urProgramCreateWithNativeHandleTest.InvalidNullPointerProgram/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urProgramGetBuildInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_UR_PROGRAM_BUILD_INFO_STATUS urProgramGetFunctionPointerTest.InvalidKernelName/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -Aborted +urProgramGetNativeHandleTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urProgramLinkErrorTest.LinkFailure/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urProgramLinkErrorTest.SetOutputOnLinkError/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urProgramSetSpecializationConstantsTest.InvalidValueSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}} +urProgramSetSpecializationConstantsTest.InvalidValueId/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}} +urProgramSetSpecializationConstantsTest.InvalidValuePtr/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}} diff --git a/test/conformance/program/program_adapter_level_zero_v2.match 
b/test/conformance/program/program_adapter_level_zero_v2.match new file mode 100644 index 0000000000..2c5b6500c3 --- /dev/null +++ b/test/conformance/program/program_adapter_level_zero_v2.match @@ -0,0 +1,12 @@ +{{NONDETERMINISTIC}} +urProgramCreateWithNativeHandleTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urProgramCreateWithNativeHandleTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urProgramCreateWithNativeHandleTest.InvalidNullPointerProgram/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urProgramGetBuildInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROGRAM_BUILD_INFO_STATUS +urProgramGetFunctionPointerTest.InvalidKernelName/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urProgramGetNativeHandleTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urProgramLinkErrorTest.LinkFailure/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +{{OPT}}urProgramLinkErrorTest.SetOutputOnLinkError/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urProgramSetSpecializationConstantsTest.InvalidValueSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}} +urProgramSetSpecializationConstantsTest.InvalidValueId/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urProgramSetSpecializationConstantsTest.InvalidValuePtr/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/program/program_adapter_native_cpu.match b/test/conformance/program/program_adapter_native_cpu.match index cf3fa7062d..55f467c42b 100644 --- a/test/conformance/program/program_adapter_native_cpu.match +++ b/test/conformance/program/program_adapter_native_cpu.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} {{OPT}}urProgramBuildTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}urProgramBuildTest.SuccessWithOptions/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} 
{{OPT}}urProgramBuildTest.InvalidNullHandleContext/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} @@ -57,7 +58,7 @@ {{OPT}}urProgramGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_CONTEXT {{OPT}}urProgramGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_NUM_DEVICES {{OPT}}urProgramGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_DEVICES -{{OPT}}urProgramGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_SOURCE +{{OPT}}urProgramGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_IL {{OPT}}urProgramGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_BINARY_SIZES {{OPT}}urProgramGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_BINARIES {{OPT}}urProgramGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_NUM_KERNELS @@ -66,7 +67,7 @@ {{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_CONTEXT {{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_NUM_DEVICES {{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_DEVICES -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_SOURCE +{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_IL {{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_BINARY_SIZES {{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_BINARIES {{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_NUM_KERNELS @@ -75,7 +76,7 @@ 
{{OPT}}urProgramGetInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_CONTEXT {{OPT}}urProgramGetInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_NUM_DEVICES {{OPT}}urProgramGetInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_DEVICES -{{OPT}}urProgramGetInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_SOURCE +{{OPT}}urProgramGetInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_IL {{OPT}}urProgramGetInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_BINARY_SIZES {{OPT}}urProgramGetInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_BINARIES {{OPT}}urProgramGetInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_NUM_KERNELS @@ -84,7 +85,7 @@ {{OPT}}urProgramGetInfoTest.InvalidSizeZero/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_CONTEXT {{OPT}}urProgramGetInfoTest.InvalidSizeZero/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_NUM_DEVICES {{OPT}}urProgramGetInfoTest.InvalidSizeZero/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_DEVICES -{{OPT}}urProgramGetInfoTest.InvalidSizeZero/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_SOURCE +{{OPT}}urProgramGetInfoTest.InvalidSizeZero/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_IL {{OPT}}urProgramGetInfoTest.InvalidSizeZero/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_BINARY_SIZES {{OPT}}urProgramGetInfoTest.InvalidSizeZero/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_BINARIES {{OPT}}urProgramGetInfoTest.InvalidSizeZero/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_NUM_KERNELS @@ -93,7 +94,7 @@ {{OPT}}urProgramGetInfoTest.InvalidSizeSmall/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_CONTEXT 
{{OPT}}urProgramGetInfoTest.InvalidSizeSmall/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_NUM_DEVICES {{OPT}}urProgramGetInfoTest.InvalidSizeSmall/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_DEVICES -{{OPT}}urProgramGetInfoTest.InvalidSizeSmall/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_SOURCE +{{OPT}}urProgramGetInfoTest.InvalidSizeSmall/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_IL {{OPT}}urProgramGetInfoTest.InvalidSizeSmall/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_BINARY_SIZES {{OPT}}urProgramGetInfoTest.InvalidSizeSmall/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_BINARIES {{OPT}}urProgramGetInfoTest.InvalidSizeSmall/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_NUM_KERNELS @@ -102,7 +103,7 @@ {{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValue/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_CONTEXT {{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValue/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_NUM_DEVICES {{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValue/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_DEVICES -{{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValue/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_SOURCE +{{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValue/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_IL {{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValue/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_BINARY_SIZES {{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValue/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_BINARIES {{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValue/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_NUM_KERNELS @@ -111,7 +112,7 @@ {{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValueRet/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_CONTEXT 
{{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValueRet/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_NUM_DEVICES {{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValueRet/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_DEVICES -{{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValueRet/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_SOURCE +{{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValueRet/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_IL {{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValueRet/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_BINARY_SIZES {{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValueRet/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_BINARIES {{OPT}}urProgramGetInfoTest.InvalidNullPointerPropValueRet/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_PROGRAM_INFO_NUM_KERNELS @@ -139,6 +140,9 @@ {{OPT}}urProgramSetSpecializationConstantsTest.InvalidNullHandleProgram/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}urProgramSetSpecializationConstantsTest.InvalidNullPointerSpecConstants/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}urProgramSetSpecializationConstantsTest.InvalidSizeCount/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urProgramSetSpecializationConstantsTest.InvalidValueSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urProgramSetSpecializationConstantsTest.InvalidValueId/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urProgramSetSpecializationConstantsTest.InvalidValuePtr/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}urProgramSetMultipleSpecializationConstantsTest.SingleCall/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}{{Segmentation fault|Aborted}} diff --git a/test/conformance/program/program_adapter_opencl.match b/test/conformance/program/program_adapter_opencl.match index a4b56d4f94..180e88e9be 100644 --- 
a/test/conformance/program/program_adapter_opencl.match +++ b/test/conformance/program/program_adapter_opencl.match @@ -1,2 +1,2 @@ +{{NONDETERMINISTIC}} urProgramCreateWithILTest.BuildInvalidProgram/Intel_R__OpenCL___{{.*}}_ -urProgramGetInfoTest.Success/Intel_R__OpenCL___{{.*}}___UR_PROGRAM_INFO_SOURCE diff --git a/test/conformance/program/urProgramCreateWithIL.cpp b/test/conformance/program/urProgramCreateWithIL.cpp index 3d81d14104..7c02c3c7b9 100644 --- a/test/conformance/program/urProgramCreateWithIL.cpp +++ b/test/conformance/program/urProgramCreateWithIL.cpp @@ -37,7 +37,8 @@ TEST_P(urProgramCreateWithILTest, Success) { } TEST_P(urProgramCreateWithILTest, SuccessWithProperties) { - ur_program_properties_t properties{UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES}; + ur_program_properties_t properties{UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES, + nullptr, 0, nullptr}; ur_program_handle_t program = nullptr; ASSERT_SUCCESS(urProgramCreateWithIL( context, il_binary->data(), il_binary->size(), &properties, &program)); diff --git a/test/conformance/program/urProgramGetInfo.cpp b/test/conformance/program/urProgramGetInfo.cpp index c9cdb7b066..7b2e6f1873 100644 --- a/test/conformance/program/urProgramGetInfo.cpp +++ b/test/conformance/program/urProgramGetInfo.cpp @@ -18,7 +18,7 @@ UUR_TEST_SUITE_P( urProgramGetInfoTest, ::testing::Values(UR_PROGRAM_INFO_REFERENCE_COUNT, UR_PROGRAM_INFO_CONTEXT, UR_PROGRAM_INFO_NUM_DEVICES, UR_PROGRAM_INFO_DEVICES, - UR_PROGRAM_INFO_SOURCE, UR_PROGRAM_INFO_BINARY_SIZES, + UR_PROGRAM_INFO_IL, UR_PROGRAM_INFO_BINARY_SIZES, UR_PROGRAM_INFO_BINARIES, UR_PROGRAM_INFO_NUM_KERNELS, UR_PROGRAM_INFO_KERNEL_NAMES), uur::deviceTestWithParamPrinter); @@ -52,8 +52,12 @@ TEST_P(urProgramGetInfoTest, Success) { sizeof(binaries[0]), binaries, nullptr)); } else { - ASSERT_SUCCESS(urProgramGetInfo(program, property_name, 0, nullptr, - &property_size)); + auto result = urProgramGetInfo(program, property_name, 0, nullptr, + &property_size); + if (result != 
UR_RESULT_SUCCESS) { + ASSERT_EQ_RESULT(result, UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); + return; + } property_value.resize(property_size); ASSERT_SUCCESS(urProgramGetInfo(program, property_name, property_size, property_value.data(), nullptr)); @@ -103,6 +107,10 @@ TEST_P(urProgramGetInfoTest, Success) { ASSERT_STRNE(returned_kernel_names, ""); break; } + case UR_PROGRAM_INFO_IL: { + ASSERT_EQ(property_value, *il_binary.get()); + break; + } default: break; } diff --git a/test/conformance/program/urProgramSetSpecializationConstants.cpp b/test/conformance/program/urProgramSetSpecializationConstants.cpp index 6d5b70322f..949c5c323e 100644 --- a/test/conformance/program/urProgramSetSpecializationConstants.cpp +++ b/test/conformance/program/urProgramSetSpecializationConstants.cpp @@ -141,3 +141,26 @@ TEST_P(urProgramSetSpecializationConstantsTest, InvalidSizeCount) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_SIZE, urProgramSetSpecializationConstants(program, 0, &info)); } + +TEST_P(urProgramSetSpecializationConstantsTest, InvalidValueSize) { + ur_specialization_constant_info_t bad_info = {0, 0x1000, &spec_value}; + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_VALUE, + urProgramSetSpecializationConstants(program, 1, &bad_info)); +} + +TEST_P(urProgramSetSpecializationConstantsTest, InvalidValueId) { + ur_specialization_constant_info_t bad_info = {999, sizeof(spec_value), + &spec_value}; + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_SPEC_ID, + urProgramSetSpecializationConstants(program, 1, &bad_info)); +} + +TEST_P(urProgramSetSpecializationConstantsTest, InvalidValuePtr) { + ur_specialization_constant_info_t bad_info = {0, sizeof(spec_value), + nullptr}; + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_VALUE, + urProgramSetSpecializationConstants(program, 1, &bad_info)); +} diff --git a/test/conformance/queue/queue_adapter_cuda.match b/test/conformance/queue/queue_adapter_cuda.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git 
a/test/conformance/queue/queue_adapter_hip.match b/test/conformance/queue/queue_adapter_hip.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/queue/queue_adapter_level_zero-v2.match b/test/conformance/queue/queue_adapter_level_zero-v2.match deleted file mode 100644 index 6370cfe9a1..0000000000 --- a/test/conformance/queue/queue_adapter_level_zero-v2.match +++ /dev/null @@ -1,12 +0,0 @@ -urQueueCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urQueueCreateTest.CheckContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urQueueFinishTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urQueueFlushTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urQueueGetInfoTestWithInfoParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_QUEUE_INFO_CONTEXT -urQueueGetInfoTestWithInfoParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_QUEUE_INFO_DEVICE -urQueueGetInfoTestWithInfoParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_QUEUE_INFO_FLAGS -urQueueGetInfoTestWithInfoParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_QUEUE_INFO_REFERENCE_COUNT -urQueueGetInfoTestWithInfoParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_QUEUE_INFO_EMPTY -urQueueGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urQueueRetainTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urQueueReleaseTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/queue/queue_adapter_level_zero.match b/test/conformance/queue/queue_adapter_level_zero.match deleted file mode 100644 index 8b13789179..0000000000 --- a/test/conformance/queue/queue_adapter_level_zero.match +++ /dev/null @@ -1 +0,0 @@ - diff --git a/test/conformance/queue/queue_adapter_native_cpu.match 
b/test/conformance/queue/queue_adapter_native_cpu.match index 1d17a6fa38..32ea573390 100644 --- a/test/conformance/queue/queue_adapter_native_cpu.match +++ b/test/conformance/queue/queue_adapter_native_cpu.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urQueueCreateTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urQueueCreateTest.CheckContext/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE diff --git a/test/conformance/queue/queue_adapter_opencl.match b/test/conformance/queue/queue_adapter_opencl.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/sampler/sampler_adapter_cuda.match b/test/conformance/sampler/sampler_adapter_cuda.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/sampler/sampler_adapter_hip.match b/test/conformance/sampler/sampler_adapter_hip.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/sampler/sampler_adapter_level_zero.match b/test/conformance/sampler/sampler_adapter_level_zero.match index 1508bd1f8b..f1b3485529 100644 --- a/test/conformance/sampler/sampler_adapter_level_zero.match +++ b/test/conformance/sampler/sampler_adapter_level_zero.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} {{OPT}}urSamplerGetInfoTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_SAMPLER_INFO_REFERENCE_COUNT {{OPT}}urSamplerGetInfoTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_SAMPLER_INFO_CONTEXT {{OPT}}urSamplerGetInfoTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_SAMPLER_INFO_NORMALIZED_COORDS diff --git a/test/conformance/sampler/sampler_adapter_level_zero-v2.match b/test/conformance/sampler/sampler_adapter_level_zero_v2.match similarity index 97% rename from 
test/conformance/sampler/sampler_adapter_level_zero-v2.match rename to test/conformance/sampler/sampler_adapter_level_zero_v2.match index 1508bd1f8b..f1b3485529 100644 --- a/test/conformance/sampler/sampler_adapter_level_zero-v2.match +++ b/test/conformance/sampler/sampler_adapter_level_zero_v2.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} {{OPT}}urSamplerGetInfoTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_SAMPLER_INFO_REFERENCE_COUNT {{OPT}}urSamplerGetInfoTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_SAMPLER_INFO_CONTEXT {{OPT}}urSamplerGetInfoTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_SAMPLER_INFO_NORMALIZED_COORDS diff --git a/test/conformance/sampler/sampler_adapter_native_cpu.match b/test/conformance/sampler/sampler_adapter_native_cpu.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/sampler/sampler_adapter_opencl.match b/test/conformance/sampler/sampler_adapter_opencl.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/sampler/urSamplerCreate.cpp b/test/conformance/sampler/urSamplerCreate.cpp index 75f6b0da4b..ceb61247e1 100644 --- a/test/conformance/sampler/urSamplerCreate.cpp +++ b/test/conformance/sampler/urSamplerCreate.cpp @@ -95,6 +95,7 @@ TEST_P(urSamplerCreateTest, InvalidEnumerationAddrMode) { ASSERT_EQ_RESULT(urSamplerCreate(context, &sampler_desc, hSampler.ptr()), UR_RESULT_ERROR_INVALID_ENUMERATION); } + TEST_P(urSamplerCreateTest, InvalidEnumerationFilterMode) { ur_sampler_desc_t sampler_desc{ UR_STRUCTURE_TYPE_SAMPLER_DESC, /* stype */ @@ -107,3 +108,19 @@ TEST_P(urSamplerCreateTest, InvalidEnumerationFilterMode) { ASSERT_EQ_RESULT(urSamplerCreate(context, &sampler_desc, hSampler.ptr()), UR_RESULT_ERROR_INVALID_ENUMERATION); } + +TEST_P(urSamplerCreateTest, InvalidNullPointer) { + ur_sampler_desc_t sampler_desc{ + UR_STRUCTURE_TYPE_SAMPLER_DESC, /* stype 
*/ + nullptr, /* pNext */ + true, /* normalizedCoords */ + UR_SAMPLER_ADDRESSING_MODE_CLAMP, /* addressing mode */ + UR_SAMPLER_FILTER_MODE_FORCE_UINT32, /* filterMode */ + }; + uur::raii::Sampler hSampler = nullptr; + ASSERT_EQ_RESULT(urSamplerCreate(context, nullptr, hSampler.ptr()), + UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ASSERT_EQ_RESULT(urSamplerCreate(context, &sampler_desc, nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); +} diff --git a/test/conformance/sampler/urSamplerCreateWithNativeHandle.cpp b/test/conformance/sampler/urSamplerCreateWithNativeHandle.cpp index bad69a16d8..59638105c9 100644 --- a/test/conformance/sampler/urSamplerCreateWithNativeHandle.cpp +++ b/test/conformance/sampler/urSamplerCreateWithNativeHandle.cpp @@ -3,6 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "uur/raii.h" #include using urSamplerCreateWithNativeHandleTest = uur::urSamplerTest; @@ -32,3 +33,84 @@ TEST_P(urSamplerCreateWithNativeHandleTest, Success) { ASSERT_EQ(addr_mode, sampler_desc.addressingMode); ASSERT_SUCCESS(urSamplerRelease(hSampler)); } + +TEST_P(urSamplerCreateWithNativeHandleTest, InvalidNullHandle) { + ur_native_handle_t native_sampler = 0; + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urSamplerGetNativeHandle(sampler, &native_sampler)); + } + + ur_sampler_handle_t hSampler = nullptr; + ur_sampler_native_properties_t props{}; + ASSERT_EQ(urSamplerCreateWithNativeHandle(native_sampler, nullptr, &props, + &hSampler), + UR_RESULT_ERROR_INVALID_NULL_HANDLE); +} + +TEST_P(urSamplerCreateWithNativeHandleTest, InvalidNullPointer) { + ur_native_handle_t native_sampler = 0; + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urSamplerGetNativeHandle(sampler, &native_sampler)); + } + + ur_sampler_native_properties_t props{}; + ASSERT_EQ(urSamplerCreateWithNativeHandle(native_sampler, context, &props, + nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); +} + +TEST_P(urSamplerCreateWithNativeHandleTest, 
SuccessWithOwnedNativeHandle) { + + ur_native_handle_t native_handle = 0; + uur::raii::Sampler hSampler = nullptr; + { + ur_sampler_desc_t sampler_desc{ + UR_STRUCTURE_TYPE_SAMPLER_DESC, /* stype */ + nullptr, /* pNext */ + true, /* normalizedCoords */ + UR_SAMPLER_ADDRESSING_MODE_NONE, /* addressing mode */ + UR_SAMPLER_FILTER_MODE_NEAREST, /* filterMode */ + }; + + ASSERT_SUCCESS(urSamplerCreate(context, &sampler_desc, hSampler.ptr())); + + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urSamplerGetNativeHandle(hSampler, &native_handle)); + } + + ur_sampler_native_properties_t props = { + UR_STRUCTURE_TYPE_SAMPLER_NATIVE_PROPERTIES, nullptr, true}; + ur_sampler_handle_t sampler = nullptr; + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urSamplerCreateWithNativeHandle( + native_handle, context, &props, &sampler)); + ASSERT_NE(sampler, nullptr); +} + +TEST_P(urSamplerCreateWithNativeHandleTest, SuccessWithUnOwnedNativeHandle) { + + ur_native_handle_t native_handle = 0; + uur::raii::Sampler hSampler = nullptr; + { + ur_sampler_desc_t sampler_desc{ + UR_STRUCTURE_TYPE_SAMPLER_DESC, /* stype */ + nullptr, /* pNext */ + true, /* normalizedCoords */ + UR_SAMPLER_ADDRESSING_MODE_NONE, /* addressing mode */ + UR_SAMPLER_FILTER_MODE_NEAREST, /* filterMode */ + }; + + ASSERT_SUCCESS(urSamplerCreate(context, &sampler_desc, hSampler.ptr())); + + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urSamplerGetNativeHandle(hSampler, &native_handle)); + } + + ur_sampler_native_properties_t props = { + UR_STRUCTURE_TYPE_SAMPLER_NATIVE_PROPERTIES, nullptr, false}; + ur_sampler_handle_t sampler = nullptr; + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urSamplerCreateWithNativeHandle( + native_handle, context, &props, &sampler)); + ASSERT_NE(sampler, nullptr); +} diff --git a/test/conformance/source/environment.cpp b/test/conformance/source/environment.cpp index 05dbe8c847..ec339a5f40 100644 --- a/test/conformance/source/environment.cpp +++ b/test/conformance/source/environment.cpp @@ -6,8 +6,11 @@ #include #include #include 
+#include +#include "ur_api.h" #include "ur_filesystem_resolved.hpp" +#include "uur/checks.h" #ifdef KERNELS_ENVIRONMENT #include "kernel_entry_points.h" @@ -23,16 +26,37 @@ constexpr char ERROR_NO_ADAPTER[] = "Could not load adapter"; PlatformEnvironment *PlatformEnvironment::instance = nullptr; -std::ostream &operator<<(std::ostream &out, - const ur_platform_handle_t &platform) { - size_t size; - urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, 0, nullptr, &size); - std::vector name(size); - urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, size, name.data(), - nullptr); - out << name.data(); - return out; -} +constexpr std::pair backends[] = { + {"LEVEL_ZERO", UR_PLATFORM_BACKEND_LEVEL_ZERO}, + {"L0", UR_PLATFORM_BACKEND_LEVEL_ZERO}, + {"OPENCL", UR_PLATFORM_BACKEND_OPENCL}, + {"CUDA", UR_PLATFORM_BACKEND_CUDA}, + {"HIP", UR_PLATFORM_BACKEND_HIP}, + {"NATIVE_CPU", UR_PLATFORM_BACKEND_NATIVE_CPU}, + {"UNKNOWN", UR_PLATFORM_BACKEND_UNKNOWN}, +}; + +namespace { +constexpr const char *backend_to_str(ur_platform_backend_t backend) { + for (auto b : backends) { + if (b.second == backend) { + return b.first; + } + } + return "INVALID"; +}; + +ur_platform_backend_t str_to_backend(std::string str) { + + std::transform(str.begin(), str.end(), str.begin(), ::toupper); + for (auto b : backends) { + if (b.first == str) { + return b.second; + } + } + return UR_PLATFORM_BACKEND_UNKNOWN; +}; +} // namespace std::ostream &operator<<(std::ostream &out, const std::vector &platforms) { @@ -42,15 +66,6 @@ std::ostream &operator<<(std::ostream &out, return out; } -std::ostream &operator<<(std::ostream &out, const ur_device_handle_t &device) { - size_t size; - urDeviceGetInfo(device, UR_DEVICE_INFO_NAME, 0, nullptr, &size); - std::vector name(size); - urDeviceGetInfo(device, UR_DEVICE_INFO_NAME, size, name.data(), nullptr); - out << name.data(); - return out; -} - std::ostream &operator<<(std::ostream &out, const std::vector &devices) { for (auto device : devices) { @@ -62,6 +77,10 
@@ std::ostream &operator<<(std::ostream &out, uur::PlatformEnvironment::PlatformEnvironment(int argc, char **argv) : platform_options{parsePlatformOptions(argc, argv)} { instance = this; + // Check for errors from parsing platform options + if (!error.empty()) { + return; + } ur_loader_config_handle_t config; if (urLoaderConfigCreate(&config) == UR_RESULT_SUCCESS) { @@ -95,79 +114,119 @@ uur::PlatformEnvironment::PlatformEnvironment(int argc, char **argv) return; } + selectPlatformFromOptions(); +} + +void uur::PlatformEnvironment::selectPlatformFromOptions() { uint32_t adapter_count = 0; urAdapterGet(0, nullptr, &adapter_count); adapters.resize(adapter_count); urAdapterGet(adapter_count, adapters.data(), nullptr); - // Search through the adapters individually so we can store the one we end - // up choosing. + struct platform_info { + ur_adapter_handle_t adapter; + ur_platform_handle_t platform; + std::string name; + ur_platform_backend_t backend; + }; + std::vector platforms; for (auto a : adapters) { uint32_t count = 0; - if (urPlatformGet(&a, 1, 0, nullptr, &count)) { - error = "urPlatformGet() failed to get number of platforms."; - return; - } + ASSERT_SUCCESS(urPlatformGet(&a, 1, 0, nullptr, &count)); + std::vector platform_list(count); + ASSERT_SUCCESS( + urPlatformGet(&a, 1, count, platform_list.data(), nullptr)); - if (count == 0) { - error = "Failed to find any platforms."; - return; - } + for (auto p : platform_list) { + ur_platform_backend_t backend; + ASSERT_SUCCESS(urPlatformGetInfo(p, UR_PLATFORM_INFO_BACKEND, + sizeof(ur_platform_backend_t), + &backend, nullptr)); - std::vector platforms(count); - if (urPlatformGet(&a, 1, count, platforms.data(), nullptr)) { - error = "urPlatformGet failed to get platforms."; - return; + size_t size; + ASSERT_SUCCESS( + urPlatformGetInfo(p, UR_PLATFORM_INFO_NAME, 0, nullptr, &size)); + std::vector platform_name{}; + platform_name.reserve(size); + ASSERT_SUCCESS(urPlatformGetInfo(p, UR_PLATFORM_INFO_NAME, size, + 
platform_name.data(), nullptr)); + + platforms.push_back(platform_info{ + a, p, std::string(platform_name.data()), backend}); } + } - if (platform_options.platform_name.empty()) { - - if (platforms.size() == 1 || - platform_options.platforms_count == 1) { - platform = platforms[0]; - adapter = a; - } else { - std::stringstream ss_error; - ss_error << "Select a single platform from below using the " - "--platform=NAME " - "command-line option:" - << platforms << std::endl - << "or set --platforms_count=1."; - error = ss_error.str(); - return; - } + std::string default_name{}; + std::map backend_platform_names{}; + auto stream = std::stringstream{platform_options.platform_name}; + for (std::string filter; std::getline(stream, filter, ';');) { + auto split = filter.find(':'); + if (split == std::string::npos) { + default_name = filter; + } else if (split == filter.length() - 1) { + // E.g: `OPENCL:`, ignore it } else { - for (auto candidate : platforms) { - size_t size; - if (urPlatformGetInfo(candidate, UR_PLATFORM_INFO_NAME, 0, - nullptr, &size)) { - error = "urPlatformGetInfoFailed"; - return; - } - std::vector platform_name(size); - if (urPlatformGetInfo(candidate, UR_PLATFORM_INFO_NAME, size, - platform_name.data(), nullptr)) { - error = "urPlatformGetInfo() failed"; - return; - } - if (platform_options.platform_name == platform_name.data()) { - platform = candidate; - adapter = a; - break; - } - } - if (!platform) { - std::stringstream ss_error; - ss_error << "Platform \"" << platform_options.platform_name - << "\" not found. 
Select a single platform from below " - "using the " - "--platform=NAME command-line options:" - << platforms << std::endl - << "or set --platforms_count=1."; - error = ss_error.str(); - return; - } + backend_platform_names.insert( + {str_to_backend(filter.substr(0, split)), + filter.substr(split + 1)}); + } + } + + std::vector platforms_filtered{}; + std::copy_if(platforms.begin(), platforms.end(), + std::inserter(platforms_filtered, platforms_filtered.begin()), + [&](platform_info info) { + if (!default_name.empty() && default_name != info.name) { + return false; + } + if (backend_platform_names.count(info.backend) && + backend_platform_names[info.backend] != info.name) { + return false; + } + if (platform_options.platform_backend && + platform_options.platform_backend != info.backend) { + return false; + } + return true; + }); + + if (platforms_filtered.size() == 0) { + std::stringstream errstr; + errstr << "No platforms were found with the following filters:"; + if (platform_options.platform_backend) { + errstr << " --backend=" + << backend_to_str(*platform_options.platform_backend); + } + if (!platform_options.platform_name.empty()) { + errstr << " --platform=\"" << platform_options.platform_name + << "\""; } + if (!platform_options.platform_backend && + platform_options.platform_name.empty()) { + errstr << " (none)"; + } + errstr << "\nAvailable platforms:\n"; + for (auto p : platforms) { + errstr << " --backend=" << backend_to_str(p.backend) + << " --platform=\"" << p.name << "\"\n"; + } + FAIL() << errstr.str(); + } else if (platforms_filtered.size() == 1 || + platform_options.platforms_count == 1) { + auto &selected = platforms_filtered[0]; + platform = selected.platform; + adapter = selected.adapter; + std::cerr << "Selected platform: [" << backend_to_str(selected.backend) + << "] " << selected.name << "\n"; + } else if (platforms_filtered.size() > 1) { + std::stringstream errstr; + errstr << "Multiple possible platforms found; please select one of the " 
+ "following or set --platforms_count=1:\n"; + for (auto p : platforms_filtered) { + errstr << " --backend=" << backend_to_str(p.backend) + << " --platform=\"" << p.name << "\"\n"; + } + FAIL() << errstr.str(); } } @@ -196,6 +255,26 @@ void uur::PlatformEnvironment::TearDown() { PlatformEnvironment::PlatformOptions PlatformEnvironment::parsePlatformOptions(int argc, char **argv) { PlatformOptions options{}; + auto parse_backend = [&](std::string backend_string) { + options.platform_backend = str_to_backend(backend_string); + if (options.platform_backend == UR_PLATFORM_BACKEND_UNKNOWN) { + std::stringstream errstr{error}; + errstr << "--backend not valid; expected one of ["; + bool first = true; + for (auto b : backends) { + if (!first) { + errstr << ", "; + } + errstr << b.first; + first = false; + } + errstr << "], but got `" << backend_string << "`"; + error = errstr.str(); + return false; + } + return true; + }; + for (int argi = 1; argi < argc; ++argi) { const char *arg = argv[argi]; if (!(std::strcmp(arg, "-h") && std::strcmp(arg, "--help"))) { @@ -205,6 +284,12 @@ PlatformEnvironment::parsePlatformOptions(int argc, char **argv) { arg, "--platform=", sizeof("--platform=") - 1) == 0) { options.platform_name = std::string(&arg[std::strlen("--platform=")]); + } else if (std::strncmp(arg, "--backend=", sizeof("--backend=") - 1) == + 0) { + std::string backend_string{&arg[std::strlen("--backend=")]}; + if (!parse_backend(backend_string)) { + return options; + } } else if (std::strncmp(arg, "--platforms_count=", sizeof("--platforms_count=") - 1) == 0) { options.platforms_count = std::strtoul( @@ -212,7 +297,7 @@ PlatformEnvironment::parsePlatformOptions(int argc, char **argv) { } } - /* If a platform was not provided using the --platform command line option, + /* If a platform was not provided using the --platform/--backend command line options, * check if environment variable is set to use as a fallback. 
*/ if (options.platform_name.empty()) { auto env_platform = ur_getenv("UR_CTS_ADAPTER_PLATFORM"); @@ -220,6 +305,14 @@ PlatformEnvironment::parsePlatformOptions(int argc, char **argv) { options.platform_name = env_platform.value(); } } + if (!options.platform_backend) { + auto env_backend = ur_getenv("UR_CTS_BACKEND"); + if (env_backend.has_value()) { + if (!parse_backend(env_backend.value())) { + return options; + } + } + } return options; } diff --git a/test/conformance/testing/include/uur/environment.h b/test/conformance/testing/include/uur/environment.h index ec4a39fe7f..bba4c583c8 100644 --- a/test/conformance/testing/include/uur/environment.h +++ b/test/conformance/testing/include/uur/environment.h @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -17,6 +18,7 @@ struct PlatformEnvironment : ::testing::Environment { struct PlatformOptions { std::string platform_name; + std::optional platform_backend; unsigned long platforms_count = 0; }; @@ -26,13 +28,14 @@ struct PlatformEnvironment : ::testing::Environment { virtual void SetUp() override; virtual void TearDown() override; + void selectPlatformFromOptions(); PlatformOptions parsePlatformOptions(int argc, char **argv); + std::string error{}; PlatformOptions platform_options; std::vector adapters{}; ur_adapter_handle_t adapter = nullptr; ur_platform_handle_t platform = nullptr; - std::string error; static PlatformEnvironment *instance; }; diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index e57a31584a..b853164fb6 100644 --- a/test/conformance/testing/include/uur/fixtures.h +++ b/test/conformance/testing/include/uur/fixtures.h @@ -513,11 +513,12 @@ struct urMultiQueueTest : urContextTest { ur_queue_handle_t queue2 = nullptr; }; -struct urMultiDeviceContextTest : urPlatformTest { +template +struct urMultiDeviceContextTestTemplate : urPlatformTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urPlatformTest::SetUp()); 
auto &devices = DevicesEnvironment::instance->devices; - if (devices.size() <= 1) { + if (devices.size() < MinDevices) { GTEST_SKIP(); } ASSERT_SUCCESS(urContextCreate(static_cast(devices.size()), @@ -534,6 +535,10 @@ struct urMultiDeviceContextTest : urPlatformTest { ur_context_handle_t context = nullptr; }; +struct urMultiDeviceContextTest : urMultiDeviceContextTestTemplate<> { + using urMultiDeviceContextTestTemplate::context; +}; + struct urMultiDeviceMemBufferTest : urMultiDeviceContextTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urMultiDeviceContextTest::SetUp()); @@ -1215,6 +1220,13 @@ template struct urProgramTestWithParam : urQueueTestWithParam { ur_program_handle_t program = nullptr; }; +// This fixture can provide a kernel, but it doesn't build the kernel at SetUp, +// instead Build() must be invoked separately. This is for tests that wish to +// check device capabilities to determine whether the test should run before +// trying to load any device code. +// +// For a fixture that provides the kernel at SetUp, inherit from urKernelTest +// instead. struct urBaseKernelTest : urProgramTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urProgramTest::SetUp()); @@ -1247,6 +1259,8 @@ struct urKernelTest : urBaseKernelTest { } }; +// Parameterized version of urBaseKernelTest, the comments on that fixture +// clarify why you'd want to use this instead of urKernelTestWithParam. 
template struct urBaseKernelTestWithParam : urProgramTestWithParam { void SetUp() override { @@ -1300,6 +1314,11 @@ struct KernelLaunchHelper { sizeof(zero), 0, size, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); + SetBuffer1DArg(mem_handle, buffer_index); + *out_buffer = mem_handle; + } + + void SetBuffer1DArg(ur_mem_handle_t mem_handle, size_t *buffer_index) { ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, current_arg_index, nullptr, mem_handle)); if (buffer_index) { @@ -1336,7 +1355,6 @@ struct KernelLaunchHelper { &accessor)); current_arg_index += 2; } - *out_buffer = mem_handle; } template void AddPodArg(T data) { @@ -1381,11 +1399,13 @@ struct KernelLaunchHelper { uint32_t current_arg_index = 0; }; +// Parameterized kernel fixture with execution helpers, for the difference +// between this and urKernelExecutionTestWithParam see the comment on +// urBaseKernelTest. template struct urBaseKernelExecutionTestWithParam : urBaseKernelTestWithParam { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam::SetUp()); - UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam::Build()); } void TearDown() override { @@ -1424,6 +1444,8 @@ struct urBaseKernelExecutionTestWithParam : urBaseKernelTestWithParam { std::vector buffer_args; }; +// Kernel fixture with execution helpers, for the difference between this and +// urKernelExecutionTest see the comment on urBaseKernelTest. 
struct urBaseKernelExecutionTest : urBaseKernelTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTest::SetUp()); diff --git a/test/conformance/testing/include/uur/raii.h b/test/conformance/testing/include/uur/raii.h index e4f456ec36..894a66dfdd 100644 --- a/test/conformance/testing/include/uur/raii.h +++ b/test/conformance/testing/include/uur/raii.h @@ -108,6 +108,12 @@ using Program = Wrapper; using Kernel = Wrapper; using Queue = Wrapper; using Event = Wrapper; +using CommandBuffer = + Wrapper; +using CommandBufferCommand = + Wrapper; } // namespace raii } // namespace uur diff --git a/test/conformance/testing/include/uur/utils.h b/test/conformance/testing/include/uur/utils.h index 7e23e55843..8e2033d8dc 100644 --- a/test/conformance/testing/include/uur/utils.h +++ b/test/conformance/testing/include/uur/utils.h @@ -424,6 +424,77 @@ ur_result_t MakeUSMAllocationByType(ur_usm_type_t type, ur_usm_pool_handle_t hPool, size_t size, void **ppMem); +inline std::tuple +decodeSemVersion(std::string version) { + auto posMajor = version.find('.'); + auto posMinor = version.find('.', posMajor + 1); + auto major = std::stoi(version.substr(0, posMajor)); + auto minor = + std::stoi(version.substr(posMajor + 1, posMinor - posMajor - 1)); + auto patch = std::stoi(version.substr(posMinor + 1)); + return std::make_tuple(major, minor, patch); +} + +inline bool isGivenAdapter(ur_platform_handle_t hPlatform, + std::string adapterName) { + size_t psize; + EXPECT_EQ( + urPlatformGetInfo(hPlatform, UR_PLATFORM_INFO_NAME, 0, nullptr, &psize), + UR_RESULT_SUCCESS); + std::string platform(psize, '\0'); + EXPECT_EQ(urPlatformGetInfo(hPlatform, UR_PLATFORM_INFO_NAME, psize, + platform.data(), nullptr), + UR_RESULT_SUCCESS); + + return platform.find(adapterName) != std::string::npos; +} + +inline std::tuple +getDriverVersion(ur_device_handle_t hDevice) { + size_t driverVersionSize = 0; + EXPECT_EQ(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_DRIVER_VERSION, 0, + nullptr, 
&driverVersionSize), + UR_RESULT_SUCCESS); + std::string driver(driverVersionSize, '\0'); + EXPECT_EQ(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_DRIVER_VERSION, + driverVersionSize, driver.data(), + &driverVersionSize), + UR_RESULT_SUCCESS); + + return decodeSemVersion(driver); +} + +#define SKIP_IF_DRIVER_TOO_OLD(adapterName, minDriverVersion, hPlatform, \ + hDevice) \ + do { \ + if (uur::isGivenAdapter(hPlatform, adapterName)) { \ + auto [major, minor, patch] = uur::getDriverVersion(hDevice); \ + auto [minMajor, minMinor, minPatch] = minL0DriverVersion; \ + if (major < minMajor || (major == minMajor && minor < minMinor) || \ + (major == minMajor && minor == minMinor && \ + patch < minPatch)) { \ + GTEST_SKIP() \ + << "Skipping test because driver version is too old for " \ + << adapterName << ". " \ + << "Driver version: " << major << "." << minor << "." \ + << patch << " Minimum required version: " << minMajor \ + << "." << minMinor << "." << minPatch; \ + } \ + } \ + } while (0) + +// Is this a Data Center GPU Max series (aka PVC)? +// TODO: change to use +// https://spec.oneapi.io/level-zero/latest/core/api.html#ze-device-ip-version-ext-t +// when that is stable. 
+static inline bool isPVC(ur_device_handle_t hDevice) { + uint32_t deviceId; + EXPECT_EQ(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_DEVICE_ID, + sizeof(uint32_t), &deviceId, nullptr), + UR_RESULT_SUCCESS); + return (deviceId & 0xff0) == 0xbd0 || (deviceId & 0xff0) == 0xb60; +} + } // namespace uur #endif // UR_CONFORMANCE_INCLUDE_UTILS_H_INCLUDED diff --git a/test/conformance/usm/usm_adapter_cuda.match b/test/conformance/usm/usm_adapter_cuda.match index 15b68f5c6c..a9f7c37b87 100644 --- a/test/conformance/usm/usm_adapter_cuda.match +++ b/test/conformance/usm/usm_adapter_cuda.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} {{OPT}}urUSMDeviceAllocTest.InvalidUSMSize/NVIDIA_CUDA_BACKEND___{{.*}}___UsePoolEnabled {{OPT}}urUSMDeviceAllocTest.InvalidUSMSize/NVIDIA_CUDA_BACKEND___{{.*}}___UsePoolDisabled {{OPT}}urUSMHostAllocTest.InvalidUSMSize/NVIDIA_CUDA_BACKEND___{{.*}}___UsePoolEnabled diff --git a/test/conformance/usm/usm_adapter_hip.match b/test/conformance/usm/usm_adapter_hip.match index 2dfdaf7253..5a1be3c9d4 100644 --- a/test/conformance/usm/usm_adapter_hip.match +++ b/test/conformance/usm/usm_adapter_hip.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urUSMDeviceAllocTest.Success/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled urUSMDeviceAllocTest.SuccessWithDescriptors/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled urUSMDeviceAllocTest.InvalidNullHandleContext/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled diff --git a/test/conformance/usm/usm_adapter_level_zero-v2.match b/test/conformance/usm/usm_adapter_level_zero-v2.match deleted file mode 100644 index 3490457687..0000000000 --- a/test/conformance/usm/usm_adapter_level_zero-v2.match +++ /dev/null @@ -1,116 +0,0 @@ -urUSMDeviceAllocTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled -urUSMDeviceAllocTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled 
-urUSMDeviceAllocTest.SuccessWithDescriptors/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled -urUSMDeviceAllocTest.SuccessWithDescriptors/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_4_8 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_4_512 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_4_2048 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_8_8 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_8_512 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_8_2048 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_16_8 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_16_512 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_16_2048 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_32_8 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_32_512 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_32_2048 
-urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_64_8 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_64_512 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_64_2048 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_4_8 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_4_512 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_4_2048 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_8_8 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_8_512 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_8_2048 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_16_8 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_16_512 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_16_2048 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_32_8 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_32_512 
-urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_32_2048 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_64_8 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_64_512 -urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_64_2048 -urUSMFreeTest.SuccessDeviceAlloc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}} -urUSMFreeTest.SuccessHostAlloc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}} -urUSMFreeTest.SuccessSharedAlloc/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}} -urUSMGetMemAllocInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_USM_ALLOC_INFO_TYPE -urUSMGetMemAllocInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_USM_ALLOC_INFO_BASE_PTR -urUSMGetMemAllocInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_USM_ALLOC_INFO_SIZE -urUSMGetMemAllocInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_USM_ALLOC_INFO_DEVICE -urUSMGetMemAllocInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_USM_ALLOC_INFO_POOL -urUSMGetMemAllocInfoNegativeTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}} -urUSMGetMemAllocInfoNegativeTest.InvalidNullPointerMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}} -urUSMGetMemAllocInfoNegativeTest.InvalidEnumeration/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}} -urUSMGetMemAllocInfoNegativeTest.InvalidValuePropSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}} -urUSMHostAllocTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled 
-urUSMHostAllocTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled -urUSMHostAllocTest.SuccessWithDescriptors/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled -urUSMHostAllocTest.SuccessWithDescriptors/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_4_8 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_4_512 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_4_2048 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_8_8 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_8_512 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_8_2048 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_16_8 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_16_512 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_16_2048 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_32_8 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_32_512 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_32_2048 
-urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_64_8 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_64_512 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_64_2048 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_4_8 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_4_512 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_4_2048 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_8_8 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_8_512 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_8_2048 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_16_8 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_16_512 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_16_2048 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_32_8 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_32_512 
-urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_32_2048 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_64_8 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_64_512 -urUSMHostAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_64_2048 -urUSMSharedAllocTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled -urUSMSharedAllocTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled -urUSMSharedAllocTest.SuccessWithDescriptors/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled -urUSMSharedAllocTest.SuccessWithDescriptors/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled -urUSMSharedAllocTest.SuccessWithMultipleAdvices/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled -urUSMSharedAllocTest.SuccessWithMultipleAdvices/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_4_8 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_4_512 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_4_2048 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_8_8 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_8_512 
-urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_8_2048 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_16_8 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_16_512 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_16_2048 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_32_8 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_32_512 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_32_2048 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_64_8 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_64_512 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled_64_2048 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_4_8 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_4_512 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_4_2048 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_8_8 
-urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_8_512 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_8_2048 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_16_8 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_16_512 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_16_2048 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_32_8 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_32_512 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_32_2048 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_64_8 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_64_512 -urUSMSharedAllocAlignmentTest.SuccessAlignedAllocations/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled_64_2048 diff --git a/test/conformance/usm/usm_adapter_level_zero.match b/test/conformance/usm/usm_adapter_level_zero.match index c036fa785c..6f2d5ab1f9 100644 --- a/test/conformance/usm/usm_adapter_level_zero.match +++ b/test/conformance/usm/usm_adapter_level_zero.match @@ -1,2 +1,3 @@ +{{NONDETERMINISTIC}} {{OPT}}urUSMDeviceAllocTest.InvalidUSMSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolEnabled 
{{OPT}}urUSMDeviceAllocTest.InvalidUSMSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UsePoolDisabled diff --git a/test/conformance/usm/usm_adapter_level_zero_v2.match b/test/conformance/usm/usm_adapter_level_zero_v2.match new file mode 100644 index 0000000000..85f9c4e5c0 --- /dev/null +++ b/test/conformance/usm/usm_adapter_level_zero_v2.match @@ -0,0 +1,8 @@ +{{NONDETERMINISTIC}} +urUSMDeviceAllocTest.InvalidUSMSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}____UsePoolEnabled +urUSMDeviceAllocTest.InvalidUSMSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}____UsePoolDisabled +urUSMGetMemAllocInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}____UR_USM_ALLOC_INFO_POOL +urUSMHostAllocTest.InvalidUSMSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}____UsePoolEnabled +urUSMHostAllocTest.InvalidUSMSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}____UsePoolDisabled +urUSMSharedAllocTest.InvalidUSMSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}____UsePoolEnabled +urUSMSharedAllocTest.InvalidUSMSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}____UsePoolDisabled diff --git a/test/conformance/usm/usm_adapter_native_cpu.match b/test/conformance/usm/usm_adapter_native_cpu.match index 13a0adbdf3..84d214c97f 100644 --- a/test/conformance/usm/usm_adapter_native_cpu.match +++ b/test/conformance/usm/usm_adapter_native_cpu.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urUSMDeviceAllocTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UsePoolEnabled urUSMDeviceAllocTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UsePoolDisabled urUSMDeviceAllocTest.SuccessWithDescriptors/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UsePoolEnabled diff --git a/test/conformance/usm/usm_adapter_opencl.match b/test/conformance/usm/usm_adapter_opencl.match index cb00bebc57..fbaba92f30 100644 --- a/test/conformance/usm/usm_adapter_opencl.match +++ 
b/test/conformance/usm/usm_adapter_opencl.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} urUSMDeviceAllocTest.Success/Intel_R__OpenCL___{{.*}}___UsePoolEnabled urUSMDeviceAllocTest.SuccessWithDescriptors/Intel_R__OpenCL___{{.*}}___UsePoolEnabled urUSMDeviceAllocTest.InvalidNullHandleContext/Intel_R__OpenCL___{{.*}}___UsePoolEnabled diff --git a/test/conformance/virtual_memory/virtual_memory_adapter_cuda.match b/test/conformance/virtual_memory/virtual_memory_adapter_cuda.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/virtual_memory/virtual_memory_adapter_hip.match b/test/conformance/virtual_memory/virtual_memory_adapter_hip.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero-v2.match b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero-v2.match deleted file mode 100644 index 9cda954748..0000000000 --- a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero-v2.match +++ /dev/null @@ -1,10 +0,0 @@ -{{OPT}}urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 -{{OPT}}urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 -{{OPT}}urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 -urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 -urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 -urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 -urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 
-urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 diff --git a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match index 9cda954748..bf8c7ce279 100644 --- a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match +++ b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match @@ -1,3 +1,4 @@ +{{NONDETERMINISTIC}} {{OPT}}urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 {{OPT}}urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 {{OPT}}urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 diff --git a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero_v2.match b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero_v2.match new file mode 100644 index 0000000000..1c83fd1e2a --- /dev/null +++ b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero_v2.match @@ -0,0 +1,84 @@ +{{NONDETERMINISTIC}} +urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 +urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 +urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 +urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 +urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 +urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 +urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 +urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 
+urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 +urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 +urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 +urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 +urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 +urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 +urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 +urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 +urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 +urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 +urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 +urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 +urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 +urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 +urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 +urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 +urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 
+urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 +urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 +urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 +urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 +urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 +urPhysicalMemReleaseTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urPhysicalMemReleaseTest.InvalidNullHandlePhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urPhysicalMemRetainTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urPhysicalMemRetainTest.InvalidNullHandlePhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemFreeTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemFreeTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemFreeTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemGetInfoTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_VIRTUAL_MEM_INFO_ACCESS_MODE +urVirtualMemGetInfoTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemGetInfoTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemGetInfoTest.InvalidEnumerationInfo/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemGranularityGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM +urVirtualMemGranularityGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED 
+urVirtualMemGranularityGetInfoNegativeTest.InvalidSizePropSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemMapTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemMapTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemMapTest.InvalidNullHandlePhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemMapTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemMapTest.InvalidEnumerationFlags/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 +urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4 +urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___8 +urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___16 +urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___32 +urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___64 +urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___128 +urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___256 +urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___512 +urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024 +urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2048 
+urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___5000 +urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___100000 +urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 +urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4 +urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___8 +urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___16 +urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___32 +urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___64 +urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___128 +urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___256 +urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___512 +urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024 +urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2048 +urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___5000 +urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___100000 +urVirtualMemReserveTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
+urVirtualMemReserveTest.InvalidNullPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemSetAccessTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemSetAccessTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemSetAccessTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemUnmapTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemUnmapTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urVirtualMemUnmapTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/virtual_memory/virtual_memory_adapter_native_cpu.match b/test/conformance/virtual_memory/virtual_memory_adapter_native_cpu.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/conformance/virtual_memory/virtual_memory_adapter_opencl.match b/test/conformance/virtual_memory/virtual_memory_adapter_opencl.match deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/layers/CMakeLists.txt b/test/layers/CMakeLists.txt index 2c10a08518..fbf532c274 100644 --- a/test/layers/CMakeLists.txt +++ b/test/layers/CMakeLists.txt @@ -8,3 +8,7 @@ add_subdirectory(validation) if(UR_ENABLE_TRACING) add_subdirectory(tracing) endif() + +if(UR_ENABLE_SANITIZER) + add_subdirectory(sanitizer) +endif() diff --git a/test/layers/sanitizer/CMakeLists.txt b/test/layers/sanitizer/CMakeLists.txt new file mode 100644 index 0000000000..a9601a89c8 --- /dev/null +++ b/test/layers/sanitizer/CMakeLists.txt @@ -0,0 +1,37 @@ +# Copyright (C) 2023-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +set(UR_SANITIZER_TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(SAN_TEST_PREFIX sanitizer_test) + +function(add_sanitizer_test_executable name) + add_ur_executable(${SAN_TEST_PREFIX}-${name} + ${ARGN}) + target_link_libraries(${SAN_TEST_PREFIX}-${name} + PRIVATE + ${PROJECT_NAME}::loader + ${PROJECT_NAME}::headers + ${PROJECT_NAME}::testing + ${PROJECT_NAME}::mock + GTest::gtest_main) +endfunction() + +function(set_sanitizer_test_properties name) + set_tests_properties(${name} PROPERTIES LABELS "sanitizer") + set_property(TEST ${name} PROPERTY ENVIRONMENT + "UR_LOG_SANITIZER=level:debug\;flush:debug\;output:stdout") +endfunction() + +function(add_sanitizer_test name) + add_sanitizer_test_executable(${name} ${ARGN}) + + add_test(NAME ${name} + COMMAND ${SAN_TEST_PREFIX}-${name} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + + set_sanitizer_test_properties(${name}) +endfunction() + +add_sanitizer_test(asan asan.cpp) diff --git a/test/layers/sanitizer/asan.cpp b/test/layers/sanitizer/asan.cpp new file mode 100644 index 0000000000..0fbfe4cefe --- /dev/null +++ b/test/layers/sanitizer/asan.cpp @@ -0,0 +1,58 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan.cpp + * + */ + +#include +#include + +TEST(DeviceAsan, Initialization) { + ur_result_t status; + + ur_loader_config_handle_t loaderConfig; + status = urLoaderConfigCreate(&loaderConfig); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + status = urLoaderConfigEnableLayer(loaderConfig, "UR_LAYER_ASAN"); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urLoaderInit(0, loaderConfig); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + ur_adapter_handle_t adapter; + status = urAdapterGet(1, &adapter, nullptr); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + ur_platform_handle_t platform; + status = urPlatformGet(&adapter, 1, 1, &platform, nullptr); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + ur_device_handle_t device; + status = urDeviceGet(platform, UR_DEVICE_TYPE_DEFAULT, 1, &device, nullptr); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + ur_context_handle_t context; + status = urContextCreate(1, &device, nullptr, &context); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urContextRelease(context); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urDeviceRelease(device); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urAdapterRelease(adapter); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urLoaderTearDown(); + ASSERT_EQ(status, UR_RESULT_SUCCESS); + + status = urLoaderConfigRelease(loaderConfig); + ASSERT_EQ(status, UR_RESULT_SUCCESS); +} diff --git a/test/layers/tracing/CMakeLists.txt b/test/layers/tracing/CMakeLists.txt index 969e4318b1..2e295c8dba 100644 --- a/test/layers/tracing/CMakeLists.txt +++ b/test/layers/tracing/CMakeLists.txt @@ -15,9 +15,9 @@ target_link_libraries(test_collector PRIVATE ${TARGET_XPTI}) target_include_directories(test_collector PRIVATE ${xpti_SOURCE_DIR}/include) if(MSVC) - target_compile_definitions(test_collector PRIVATE - XPTI_STATIC_LIBRARY XPTI_CALLBACK_API_EXPORTS) + target_compile_definitions(test_collector PRIVATE 
XPTI_STATIC_LIBRARY) endif() +target_compile_definitions(test_collector PRIVATE XPTI_CALLBACK_API_EXPORTS) function(set_tracing_test_props target_name collector_name) set_tests_properties(${target_name} PROPERTIES @@ -38,7 +38,6 @@ add_test(NAME example-collected-hello-world -D TEST_FILE=$ -D MATCH_FILE=${CMAKE_CURRENT_SOURCE_DIR}/hello_world.out.match -P ${PROJECT_SOURCE_DIR}/cmake/match.cmake - DEPENDS collector hello_world ) set_tracing_test_props(example-collected-hello-world collector) @@ -49,7 +48,6 @@ add_test(NAME example-logged-hello-world -D TEST_FILE=$ -D MATCH_FILE=${CMAKE_CURRENT_SOURCE_DIR}/hello_world.out.logged.match -P ${PROJECT_SOURCE_DIR}/cmake/match.cmake - DEPENDS hello_world ) set_tests_properties(example-logged-hello-world PROPERTIES LABELS "tracing") set_property(TEST example-logged-hello-world PROPERTY ENVIRONMENT @@ -73,7 +71,6 @@ function(add_tracing_test name) -D TEST_FILE=$ -D MATCH_FILE=${CMAKE_CURRENT_SOURCE_DIR}/${name}.out.match -P ${PROJECT_SOURCE_DIR}/cmake/match.cmake - DEPENDS test_collector WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) set_tracing_test_props(${name} test_collector) diff --git a/test/layers/tracing/hello_world.out.logged.match b/test/layers/tracing/hello_world.out.logged.match index 99b9cac909..336056dfbc 100644 --- a/test/layers/tracing/hello_world.out.logged.match +++ b/test/layers/tracing/hello_world.out.logged.match @@ -1,13 +1,23 @@ Platform initialized. 
----> urAdapterGet(.NumEntries = 0, .phAdapters = {{.*}}, .pNumAdapters = {{.*}} (1)) -> UR_RESULT_SUCCESS; ----> urAdapterGet(.NumEntries = 1, .phAdapters = {{.*}}, .pNumAdapters = nullptr) -> UR_RESULT_SUCCESS; ----> urPlatformGet(.phAdapters = {{.*}}, .NumAdapters = 1, .NumEntries = 1, .phPlatforms = {{.*}}, .pNumPlatforms = {{.*}} (1)) -> UR_RESULT_SUCCESS; ----> urPlatformGet(.phAdapters = {{.*}}, .NumAdapters = 1, .NumEntries = 1, .phPlatforms = {{.*}}, .pNumPlatforms = nullptr) -> UR_RESULT_SUCCESS; ----> urPlatformGetApiVersion(.hPlatform = {{.*}}, .pVersion = {{.*}} ({{0\.[0-9]+}})) -> UR_RESULT_SUCCESS; + ---> urAdapterGet + <--- urAdapterGet(.NumEntries = 0, .phAdapters = {{.*}}, .pNumAdapters = {{.*}} (1)) -> UR_RESULT_SUCCESS; + ---> urAdapterGet + <--- urAdapterGet(.NumEntries = 1, .phAdapters = {{.*}}, .pNumAdapters = nullptr) -> UR_RESULT_SUCCESS; + ---> urPlatformGet + <--- urPlatformGet(.phAdapters = {{.*}}, .NumAdapters = 1, .NumEntries = 1, .phPlatforms = {{.*}}, .pNumPlatforms = {{.*}} (1)) -> UR_RESULT_SUCCESS; + ---> urPlatformGet + <--- urPlatformGet(.phAdapters = {{.*}}, .NumAdapters = 1, .NumEntries = 1, .phPlatforms = {{.*}}, .pNumPlatforms = nullptr) -> UR_RESULT_SUCCESS; + ---> urPlatformGetApiVersion + <--- urPlatformGetApiVersion(.hPlatform = {{.*}}, .pVersion = {{.*}} ({{0\.[0-9]+}})) -> UR_RESULT_SUCCESS; API version: {{0\.[0-9]+}} ----> urDeviceGet(.hPlatform = {{.*}}, .DeviceType = UR_DEVICE_TYPE_GPU, .NumEntries = 0, .phDevices = {}, .pNumDevices = {{.*}} (1)) -> UR_RESULT_SUCCESS; ----> urDeviceGet(.hPlatform = {{.*}}, .DeviceType = UR_DEVICE_TYPE_GPU, .NumEntries = 1, .phDevices = {{.*}}, .pNumDevices = nullptr) -> UR_RESULT_SUCCESS; ----> urDeviceGetInfo(.hDevice = {{.*}}, .propName = UR_DEVICE_INFO_TYPE, .propSize = 4, .pPropValue = {{.*}} (UR_DEVICE_TYPE_GPU), .pPropSizeRet = nullptr) -> UR_RESULT_SUCCESS; ----> urDeviceGetInfo(.hDevice = {{.*}}, .propName = UR_DEVICE_INFO_NAME, .propSize = {{.*}}, .pPropValue = {{.*}} (Mock 
Device), .pPropSizeRet = nullptr) -> UR_RESULT_SUCCESS; + ---> urDeviceGet + <--- urDeviceGet(.hPlatform = {{.*}}, .DeviceType = UR_DEVICE_TYPE_GPU, .NumEntries = 0, .phDevices = {}, .pNumDevices = {{.*}} (1)) -> UR_RESULT_SUCCESS; + ---> urDeviceGet + <--- urDeviceGet(.hPlatform = {{.*}}, .DeviceType = UR_DEVICE_TYPE_GPU, .NumEntries = 1, .phDevices = {{.*}}, .pNumDevices = nullptr) -> UR_RESULT_SUCCESS; + ---> urDeviceGetInfo + <--- urDeviceGetInfo(.hDevice = {{.*}}, .propName = UR_DEVICE_INFO_TYPE, .propSize = 4, .pPropValue = {{.*}} (UR_DEVICE_TYPE_GPU), .pPropSizeRet = nullptr) -> UR_RESULT_SUCCESS; + ---> urDeviceGetInfo + <--- urDeviceGetInfo(.hDevice = {{.*}}, .propName = UR_DEVICE_INFO_NAME, .propSize = {{.*}}, .pPropValue = {{.*}} (Mock Device), .pPropSizeRet = nullptr) -> UR_RESULT_SUCCESS; Found a Mock Device gpu. ----> urAdapterRelease(.hAdapter = {{.*}}) -> UR_RESULT_SUCCESS; + ---> urAdapterRelease + <--- urAdapterRelease(.hAdapter = {{.*}}) -> UR_RESULT_SUCCESS; diff --git a/test/layers/tracing/test_collector.cpp b/test/layers/tracing/test_collector.cpp index 6c942c63ec..2e412427a7 100644 --- a/test/layers/tracing/test_collector.cpp +++ b/test/layers/tracing/test_collector.cpp @@ -25,7 +25,7 @@ constexpr uint16_t TRACE_FN_BEGIN = static_cast(xpti::trace_point_type_t::function_with_args_begin); constexpr uint16_t TRACE_FN_END = static_cast(xpti::trace_point_type_t::function_with_args_end); -constexpr std::string_view UR_STREAM_NAME = "ur"; +constexpr std::string_view UR_STREAM_NAME = "ur.call"; XPTI_CALLBACK_API void trace_cb(uint16_t trace_type, xpti::trace_event_data_t *, xpti::trace_event_data_t *child, uint64_t, diff --git a/test/layers/validation/fixtures.hpp b/test/layers/validation/fixtures.hpp index 9e261f0a1d..e0329f667e 100644 --- a/test/layers/validation/fixtures.hpp +++ b/test/layers/validation/fixtures.hpp @@ -128,12 +128,12 @@ struct valAllDevicesTest : valPlatformTest { // We use this to avoid segfaults in the mock adapter when we're 
doing stuff // like double releases in the leak detection tests. -inline ur_result_t genericSuccessCallback(void *) { return UR_RESULT_SUCCESS; }; +inline ur_result_t genericSuccessCallback(void *) { return UR_RESULT_SUCCESS; } // This returns valid (non-null) handles that we can safely leak. inline ur_result_t fakeContext_urContextCreate(void *pParams) { static std::atomic_int handle = 42; - auto params = *static_cast(pParams); + const auto ¶ms = *static_cast(pParams); // There are two casts because windows doesn't implicitly extend the 32 bit // result of atomic_int::operator++. **params.pphContext = diff --git a/test/layers/validation/leaks.cpp b/test/layers/validation/leaks.cpp index 59b6bdb750..cd4fc4b739 100644 --- a/test/layers/validation/leaks.cpp +++ b/test/layers/validation/leaks.cpp @@ -9,7 +9,7 @@ // We need a fake handle for the below adapter leak test. inline ur_result_t fakeAdapter_urAdapterGet(void *pParams) { - auto params = *static_cast(pParams); + const auto ¶ms = *static_cast(pParams); **params.pphAdapters = reinterpret_cast(0x1); return UR_RESULT_SUCCESS; } diff --git a/test/loader/CMakeLists.txt b/test/loader/CMakeLists.txt index 692a5f5d1d..3f1a06448b 100644 --- a/test/loader/CMakeLists.txt +++ b/test/loader/CMakeLists.txt @@ -3,7 +3,7 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -add_test(NAME example-hello-world COMMAND hello_world DEPENDS hello_world) +add_test(NAME example-hello-world COMMAND hello_world) set_tests_properties(example-hello-world PROPERTIES LABELS "loader" ENVIRONMENT "UR_ADAPTERS_FORCE_LOAD=\"$\"" ) diff --git a/test/loader/adapter_registry/CMakeLists.txt b/test/loader/adapter_registry/CMakeLists.txt index 2778ad5c40..6d80430e6c 100644 --- a/test/loader/adapter_registry/CMakeLists.txt +++ b/test/loader/adapter_registry/CMakeLists.txt @@ -51,3 +51,7 @@ add_adapter_reg_search_test(search-order SEARCH_PATH ${TEST_SEARCH_PATH} ENVS "TEST_ADAPTER_SEARCH_PATH=\"${TEST_SEARCH_PATH}\"" 
"TEST_CUR_SEARCH_PATH=\"${TEST_BIN_PATH}\"" SOURCES search_order.cpp) + +add_adapter_reg_search_test(prefilter + SEARCH_PATH "" + SOURCES prefilter.cpp) diff --git a/test/loader/adapter_registry/fixtures.hpp b/test/loader/adapter_registry/fixtures.hpp index 79a831d40f..da5c963e8a 100644 --- a/test/loader/adapter_registry/fixtures.hpp +++ b/test/loader/adapter_registry/fixtures.hpp @@ -74,5 +74,49 @@ struct adapterRegSearchTest : ::testing::Test { } } }; +#ifndef _WIN32 +struct adapterPreFilterTest : ::testing::Test { + ur_loader::AdapterRegistry *registry; + const fs::path levelzeroLibName = + MAKE_LIBRARY_NAME("ur_adapter_level_zero", "0"); + std::function islevelzeroLibName = + [this](const fs::path &path) { return path == levelzeroLibName; }; + + std::function &)> haslevelzeroLibName = + [this](const std::vector &paths) { + return std::any_of(paths.cbegin(), paths.cend(), + islevelzeroLibName); + }; + + const fs::path openclLibName = MAKE_LIBRARY_NAME("ur_adapter_opencl", "0"); + std::function isOpenclLibName = + [this](const fs::path &path) { return path == openclLibName; }; + + std::function &)> hasOpenclLibName = + [this](const std::vector &paths) { + return std::any_of(paths.cbegin(), paths.cend(), isOpenclLibName); + }; + + const fs::path cudaLibName = MAKE_LIBRARY_NAME("ur_adapter_cuda", "0"); + std::function isCudaLibName = + [this](const fs::path &path) { return path == cudaLibName; }; + + std::function &)> hasCudaLibName = + [this](const std::vector &paths) { + return std::any_of(paths.cbegin(), paths.cend(), isCudaLibName); + }; + + void SetUp(std::string filter) { + try { + setenv("ONEAPI_DEVICE_SELECTOR", filter.c_str(), 1); + registry = new ur_loader::AdapterRegistry; + } catch (const std::invalid_argument &e) { + FAIL() << e.what(); + } + } + void SetUp() override {} + void TearDown() override { delete registry; } +}; +#endif #endif // UR_ADAPTER_REG_TEST_HELPERS_H diff --git a/test/loader/adapter_registry/prefilter.cpp 
b/test/loader/adapter_registry/prefilter.cpp new file mode 100644 index 0000000000..1d2b095da3 --- /dev/null +++ b/test/loader/adapter_registry/prefilter.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.hpp" + +#ifndef _WIN32 + +TEST_F(adapterPreFilterTest, testPrefilterAcceptFilterSingleBackend) { + SetUp("level_zero:*"); + auto levelZeroExists = + std::any_of(registry->cbegin(), registry->cend(), haslevelzeroLibName); + EXPECT_TRUE(levelZeroExists); + auto openclExists = + std::any_of(registry->cbegin(), registry->cend(), hasOpenclLibName); + EXPECT_FALSE(openclExists); + auto cudaExists = + std::any_of(registry->cbegin(), registry->cend(), hasCudaLibName); + EXPECT_FALSE(cudaExists); +} + +TEST_F(adapterPreFilterTest, testPrefilterAcceptFilterMultipleBackends) { + SetUp("level_zero:*;opencl:*"); + auto levelZeroExists = + std::any_of(registry->cbegin(), registry->cend(), haslevelzeroLibName); + EXPECT_TRUE(levelZeroExists); + auto openclExists = + std::any_of(registry->cbegin(), registry->cend(), hasOpenclLibName); + EXPECT_TRUE(openclExists); + auto cudaExists = + std::any_of(registry->cbegin(), registry->cend(), hasCudaLibName); + EXPECT_FALSE(cudaExists); +} + +TEST_F(adapterPreFilterTest, testPrefilterDiscardFilterSingleBackend) { + SetUp("!level_zero:*"); + auto levelZeroExists = + std::any_of(registry->cbegin(), registry->cend(), haslevelzeroLibName); + EXPECT_FALSE(levelZeroExists); + auto openclExists = + std::any_of(registry->cbegin(), registry->cend(), hasOpenclLibName); + EXPECT_TRUE(openclExists); + auto cudaExists = + std::any_of(registry->cbegin(), registry->cend(), hasCudaLibName); + EXPECT_TRUE(cudaExists); +} + +TEST_F(adapterPreFilterTest, testPrefilterDiscardFilterMultipleBackends) { + SetUp("!level_zero:*;!cuda:*"); + auto 
levelZeroExists = + std::any_of(registry->cbegin(), registry->cend(), haslevelzeroLibName); + EXPECT_FALSE(levelZeroExists); + auto openclExists = + std::any_of(registry->cbegin(), registry->cend(), hasOpenclLibName); + EXPECT_TRUE(openclExists); + auto cudaExists = + std::any_of(registry->cbegin(), registry->cend(), hasCudaLibName); + EXPECT_FALSE(cudaExists); +} + +TEST_F(adapterPreFilterTest, testPrefilterAcceptAndDiscardFilter) { + SetUp("!cuda:*;level_zero:*"); + auto levelZeroExists = + std::any_of(registry->cbegin(), registry->cend(), haslevelzeroLibName); + EXPECT_TRUE(levelZeroExists); + auto openclExists = + std::any_of(registry->cbegin(), registry->cend(), hasOpenclLibName); + EXPECT_FALSE(openclExists); + auto cudaExists = + std::any_of(registry->cbegin(), registry->cend(), hasCudaLibName); + EXPECT_FALSE(cudaExists); +} + +TEST_F(adapterPreFilterTest, testPrefilterDiscardFilterAll) { + SetUp("*"); + auto levelZeroExists = + std::any_of(registry->cbegin(), registry->cend(), haslevelzeroLibName); + EXPECT_TRUE(levelZeroExists); + auto openclExists = + std::any_of(registry->cbegin(), registry->cend(), hasOpenclLibName); + EXPECT_TRUE(openclExists); + auto cudaExists = + std::any_of(registry->cbegin(), registry->cend(), hasCudaLibName); + EXPECT_TRUE(cudaExists); +} + +TEST_F(adapterPreFilterTest, testPrefilterWithInvalidMissingBackend) { + SetUp(":garbage"); + auto levelZeroExists = + std::any_of(registry->cbegin(), registry->cend(), haslevelzeroLibName); + EXPECT_TRUE(levelZeroExists); + auto openclExists = + std::any_of(registry->cbegin(), registry->cend(), hasOpenclLibName); + EXPECT_TRUE(openclExists); + auto cudaExists = + std::any_of(registry->cbegin(), registry->cend(), hasCudaLibName); + EXPECT_TRUE(cudaExists); +} + +TEST_F(adapterPreFilterTest, testPrefilterWithInvalidBackend) { + SetUp("garbage:0"); + auto levelZeroExists = + std::any_of(registry->cbegin(), registry->cend(), haslevelzeroLibName); + EXPECT_TRUE(levelZeroExists); + auto 
openclExists = + std::any_of(registry->cbegin(), registry->cend(), hasOpenclLibName); + EXPECT_TRUE(openclExists); + auto cudaExists = + std::any_of(registry->cbegin(), registry->cend(), hasCudaLibName); + EXPECT_TRUE(cudaExists); +} + +TEST_F(adapterPreFilterTest, testPrefilterWithNotAllAndAcceptFilter) { + SetUp("!*;level_zero"); + auto levelZeroExists = + std::any_of(registry->cbegin(), registry->cend(), haslevelzeroLibName); + EXPECT_TRUE(levelZeroExists); + auto openclExists = + std::any_of(registry->cbegin(), registry->cend(), hasOpenclLibName); + EXPECT_FALSE(openclExists); + auto cudaExists = + std::any_of(registry->cbegin(), registry->cend(), hasCudaLibName); + EXPECT_FALSE(cudaExists); +} + +TEST_F(adapterPreFilterTest, testPrefilterWithNotAllFilter) { + SetUp("!*"); + auto levelZeroExists = + std::any_of(registry->cbegin(), registry->cend(), haslevelzeroLibName); + EXPECT_FALSE(levelZeroExists); + auto openclExists = + std::any_of(registry->cbegin(), registry->cend(), hasOpenclLibName); + EXPECT_FALSE(openclExists); + auto cudaExists = + std::any_of(registry->cbegin(), registry->cend(), hasCudaLibName); + EXPECT_FALSE(cudaExists); +} + +#endif diff --git a/test/loader/handles/fixtures.hpp b/test/loader/handles/fixtures.hpp index 8044c90414..441433d899 100644 --- a/test/loader/handles/fixtures.hpp +++ b/test/loader/handles/fixtures.hpp @@ -15,7 +15,7 @@ #endif ur_result_t replace_urPlatformGet(void *pParams) { - auto params = *static_cast(pParams); + const auto &params = *static_cast(pParams); if (*params.ppNumPlatforms) { **params.ppNumPlatforms = 1; @@ -29,7 +29,7 @@ ur_result_t replace_urPlatformGet(void *pParams) { } ur_result_t replace_urDeviceGetInfo(void *pParams) { - auto params = *static_cast(pParams); + const auto &params = *static_cast(pParams); if (*params.ppropName == UR_DEVICE_INFO_PLATFORM) { if (*params.ppPropSizeRet) { **params.ppPropSizeRet = sizeof(ur_platform_handle_t); diff --git a/test/loader/loader_config/CMakeLists.txt
b/test/loader/loader_config/CMakeLists.txt index db07bec990..9a1c306a58 100644 --- a/test/loader/loader_config/CMakeLists.txt +++ b/test/loader/loader_config/CMakeLists.txt @@ -9,6 +9,7 @@ add_ur_executable(test-loader-config urLoaderConfigEnableLayer.cpp urLoaderConfigRelease.cpp urLoaderConfigRetain.cpp + urLoaderConfigSetCodeLocationCallback.cpp ) target_link_libraries(test-loader-config diff --git a/test/loader/loader_config/urLoaderConfigCreate.cpp b/test/loader/loader_config/urLoaderConfigCreate.cpp index df1293661b..9dc52dcda2 100644 --- a/test/loader/loader_config/urLoaderConfigCreate.cpp +++ b/test/loader/loader_config/urLoaderConfigCreate.cpp @@ -17,6 +17,7 @@ struct LoaderConfigCreateTest : ::testing::Test { TEST_F(LoaderConfigCreateTest, Success) { ASSERT_SUCCESS(urLoaderConfigCreate(&loaderConfig)); + ASSERT_TRUE(loaderConfig != nullptr); } TEST_F(LoaderConfigCreateTest, InvalidNullPointerLoaderConfig) { diff --git a/test/loader/loader_config/urLoaderConfigGetInfo.cpp b/test/loader/loader_config/urLoaderConfigGetInfo.cpp index 1985e57060..cebc37d91e 100644 --- a/test/loader/loader_config/urLoaderConfigGetInfo.cpp +++ b/test/loader/loader_config/urLoaderConfigGetInfo.cpp @@ -5,7 +5,9 @@ #include "fixtures.hpp" -struct urLoaderConfigGetInfoTest +#include + +struct urLoaderConfigGetInfoWithParamTest : LoaderConfigTest, ::testing::WithParamInterface { void SetUp() override { @@ -23,30 +25,111 @@ struct urLoaderConfigGetInfoTest }; INSTANTIATE_TEST_SUITE_P( - , urLoaderConfigGetInfoTest, + , urLoaderConfigGetInfoWithParamTest, ::testing::Values(UR_LOADER_CONFIG_INFO_AVAILABLE_LAYERS, UR_LOADER_CONFIG_INFO_REFERENCE_COUNT)); -TEST_P(urLoaderConfigGetInfoTest, Success) { +TEST_P(urLoaderConfigGetInfoWithParamTest, Success) { ASSERT_SUCCESS(urLoaderConfigGetInfo(loaderConfig, infoType, infoSize, infoAllocation.data(), nullptr)); } -TEST_P(urLoaderConfigGetInfoTest, InvalidNullHandleLoaderConfig) { +TEST_P(urLoaderConfigGetInfoWithParamTest, 
InvalidNullHandleLoaderConfig) { ASSERT_EQ(UR_RESULT_ERROR_INVALID_NULL_HANDLE, urLoaderConfigGetInfo(nullptr, infoType, infoSize, infoAllocation.data(), nullptr)); } -TEST_P(urLoaderConfigGetInfoTest, InvalidNullPointer) { +TEST_P(urLoaderConfigGetInfoWithParamTest, InvalidNullPointer) { + ASSERT_EQ( + UR_RESULT_ERROR_INVALID_NULL_POINTER, + urLoaderConfigGetInfo(loaderConfig, infoType, 1, nullptr, nullptr)); + ASSERT_EQ( UR_RESULT_ERROR_INVALID_NULL_POINTER, urLoaderConfigGetInfo(loaderConfig, infoType, 0, nullptr, nullptr)); } -TEST_P(urLoaderConfigGetInfoTest, InvalidEnumerationInfoType) { +TEST_P(urLoaderConfigGetInfoWithParamTest, InvalidEnumerationInfoType) { ASSERT_EQ(UR_RESULT_ERROR_INVALID_ENUMERATION, urLoaderConfigGetInfo(loaderConfig, UR_LOADER_CONFIG_INFO_FORCE_UINT32, 0, nullptr, &infoSize)); } + +TEST_P(urLoaderConfigGetInfoWithParamTest, InvalidSize) { + ASSERT_EQ(UR_RESULT_ERROR_INVALID_SIZE, + urLoaderConfigGetInfo(loaderConfig, infoType, 0, + infoAllocation.data(), &infoSize)); + + ASSERT_EQ(UR_RESULT_ERROR_INVALID_SIZE, + urLoaderConfigGetInfo(loaderConfig, infoType, infoSize - 1, + infoAllocation.data(), &infoSize)); +} + +using urLoaderConfigGetInfoTest = LoaderConfigTest; + +TEST_F(urLoaderConfigGetInfoTest, ReferenceCountNonZero) { + uint32_t referenceCount = 0; + ASSERT_SUCCESS(urLoaderConfigGetInfo( + loaderConfig, UR_LOADER_CONFIG_INFO_REFERENCE_COUNT, + sizeof(referenceCount), &referenceCount, nullptr)); + ASSERT_GT(referenceCount, 0); +} + +std::vector splitString(const std::string &str, char delimiter) { + std::vector tokens; + std::stringstream ss(str); + std::string token; + while (std::getline(ss, token, delimiter)) { + tokens.push_back(token); + } + return tokens; +} + +bool isLayerStringValid(std::string &layersString, + const std::vector &validLayers) { + if (layersString.empty()) { + return true; + } + + layersString.pop_back(); // remove null terminator before comparing + std::vector layers = splitString(layersString, ';'); + 
+ for (const std::string &layer : layers) { + if (std::find(validLayers.begin(), validLayers.end(), layer) == + validLayers.end()) { + return false; + } + } + + return true; +} + +TEST_F(urLoaderConfigGetInfoTest, ValidLayersList) { + std::vector layerNames{ + "UR_LAYER_PARAMETER_VALIDATION", + "UR_LAYER_BOUNDS_CHECKING", + "UR_LAYER_LEAK_CHECKING", + "UR_LAYER_LIFETIME_VALIDATION", + "UR_LAYER_FULL_VALIDATION", + "UR_LAYER_TRACING", + "UR_LAYER_ASAN", + "UR_LAYER_MSAN", + "UR_LAYER_TSAN", + }; + + std::string availableLayers; + size_t availableLayersLength = 0; + + ASSERT_SUCCESS(urLoaderConfigGetInfo(loaderConfig, + UR_LOADER_CONFIG_INFO_AVAILABLE_LAYERS, + 0, nullptr, &availableLayersLength)); + + availableLayers.resize(availableLayersLength); + ASSERT_SUCCESS(urLoaderConfigGetInfo( + loaderConfig, UR_LOADER_CONFIG_INFO_AVAILABLE_LAYERS, + availableLayersLength, availableLayers.data(), nullptr)); + + ASSERT_TRUE(isLayerStringValid(availableLayers, layerNames)); +} diff --git a/test/loader/loader_config/urLoaderConfigSetCodeLocationCallback.cpp b/test/loader/loader_config/urLoaderConfigSetCodeLocationCallback.cpp new file mode 100644 index 0000000000..a28532f5d1 --- /dev/null +++ b/test/loader/loader_config/urLoaderConfigSetCodeLocationCallback.cpp @@ -0,0 +1,35 @@ +// Copyright (C) 2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.hpp" + +ur_code_location_t codeLocationCallback([[maybe_unused]] void *userData) { + ur_code_location_t codeloc; + codeloc.columnNumber = 1; + codeloc.lineNumber = 2; + codeloc.functionName = "fname"; + codeloc.sourceFile = "sfile"; + + return codeloc; +} + +struct urLoaderConfigSetCodeLocationCallbackTest : LoaderConfigTest {}; + +TEST_F(urLoaderConfigSetCodeLocationCallbackTest, Success) { + ASSERT_SUCCESS(urLoaderConfigSetCodeLocationCallback( + loaderConfig, codeLocationCallback, nullptr)); +} + +TEST_F(urLoaderConfigSetCodeLocationCallbackTest, InvalidNullHandle) { + ASSERT_EQ(urLoaderConfigSetCodeLocationCallback( + nullptr, codeLocationCallback, nullptr), + UR_RESULT_ERROR_INVALID_NULL_HANDLE); +} + +TEST_F(urLoaderConfigSetCodeLocationCallbackTest, InvalidNullPointer) { + ASSERT_EQ( + urLoaderConfigSetCodeLocationCallback(loaderConfig, nullptr, nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); +} diff --git a/test/loader/platforms/CMakeLists.txt b/test/loader/platforms/CMakeLists.txt index 92e74856e7..2ff9060b9c 100644 --- a/test/loader/platforms/CMakeLists.txt +++ b/test/loader/platforms/CMakeLists.txt @@ -25,7 +25,6 @@ function(add_loader_platform_test name ENV) -D MODE=stdout -D MATCH_FILE=${CMAKE_CURRENT_SOURCE_DIR}/${name}.match -P ${PROJECT_SOURCE_DIR}/cmake/match.cmake - DEPENDS test-loader-platforms ur_adapter_mock ) set_tests_properties(${TEST_NAME} PROPERTIES LABELS "loader" diff --git a/test/tools/urtrace/CMakeLists.txt b/test/tools/urtrace/CMakeLists.txt index 629982898e..a16f369c29 100644 --- a/test/tools/urtrace/CMakeLists.txt +++ b/test/tools/urtrace/CMakeLists.txt @@ -19,7 +19,6 @@ function(add_trace_test name CLI_ARGS) -D MODE=stdout -D MATCH_FILE=${CMAKE_CURRENT_BINARY_DIR}/${name}.match -P ${PROJECT_SOURCE_DIR}/cmake/match.cmake - DEPENDS ur_trace_cli hello_world ) set_tests_properties(${TEST_NAME} PROPERTIES LABELS "urtrace") 
endfunction() diff --git a/test/unit/utils/CMakeLists.txt b/test/unit/utils/CMakeLists.txt index a0e0fd3ef7..62681b1032 100644 --- a/test/unit/utils/CMakeLists.txt +++ b/test/unit/utils/CMakeLists.txt @@ -13,3 +13,6 @@ add_unit_test(params add_unit_test(print print.cpp) + +add_unit_test(helpers + helpers.cpp) diff --git a/test/unit/utils/helpers.cpp b/test/unit/utils/helpers.cpp new file mode 100644 index 0000000000..87223b21cc --- /dev/null +++ b/test/unit/utils/helpers.cpp @@ -0,0 +1,30 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include + +#include "ur_util.hpp" + +TEST(groupDigits, Success) { + EXPECT_EQ(groupDigits(-1), "-1"); + EXPECT_EQ(groupDigits(-12), "-12"); + EXPECT_EQ(groupDigits(-123), "-123"); + EXPECT_EQ(groupDigits(-1234), "-1'234"); + EXPECT_EQ(groupDigits(-12345), "-12'345"); + EXPECT_EQ(groupDigits(-123456), "-123'456"); + EXPECT_EQ(groupDigits(-1234567), "-1'234'567"); + EXPECT_EQ(groupDigits(-12345678), "-12'345'678"); + + EXPECT_EQ(groupDigits(0), "0"); + EXPECT_EQ(groupDigits(1), "1"); + EXPECT_EQ(groupDigits(12), "12"); + EXPECT_EQ(groupDigits(123), "123"); + EXPECT_EQ(groupDigits(1234), "1'234"); + EXPECT_EQ(groupDigits(12345), "12'345"); + EXPECT_EQ(groupDigits(123456), "123'456"); + EXPECT_EQ(groupDigits(1234567), "1'234'567"); + EXPECT_EQ(groupDigits(12345678), "12'345'678"); +} diff --git a/test/unit/utils/params.cpp b/test/unit/utils/params.cpp index c456f69795..e86181344c 100644 --- a/test/unit/utils/params.cpp +++ b/test/unit/utils/params.cpp @@ -27,3 +27,17 @@ TEST(PrintPtr, nested_void_ptrs) { ur::details::printPtr(out, pppreal); EXPECT_THAT(out.str(), MatchesRegex(".+ \\(.+ \\(.+ \\(.+\\)\\)\\)")); } + +TEST(PrintBool, False) { + ur_bool_t value = false; + std::ostringstream out; + out << value; + EXPECT_STREQ(out.str().data(), "false"); +} + 
+TEST(PrintBool, True) { + ur_bool_t value = 1; + std::ostringstream out; + out << value; + EXPECT_STREQ(out.str().data(), "true"); +} diff --git a/test/usm/CMakeLists.txt b/test/usm/CMakeLists.txt index 1e3d3eb78d..4d0d459bd8 100644 --- a/test/usm/CMakeLists.txt +++ b/test/usm/CMakeLists.txt @@ -15,6 +15,7 @@ function(add_usm_test name) PRIVATE ${PROJECT_NAME}::common ${PROJECT_NAME}::loader + ${PROJECT_NAME}::umf ur_testing GTest::gtest_main) add_test(NAME usm-${name} diff --git a/test/usm/usmPoolManager.cpp b/test/usm/usmPoolManager.cpp index b463f90210..bdcf402ac2 100644 --- a/test/usm/usmPoolManager.cpp +++ b/test/usm/usmPoolManager.cpp @@ -76,7 +76,7 @@ TEST_P(urUsmPoolManagerTest, poolManagerPopulate) { // Populate the pool manager auto poolUnique = createMockPoolHandle(); ASSERT_NE(poolUnique, nullptr); - ret = manager.addPool(desc, poolUnique); + ret = manager.addPool(desc, std::move(poolUnique)); ASSERT_EQ(ret, UR_RESULT_SUCCESS); } @@ -97,11 +97,11 @@ TEST_P(urUsmPoolManagerTest, poolManagerInsertExisting) { auto poolUnique = createMockPoolHandle(); ASSERT_NE(poolUnique, nullptr); - ret = manager.addPool(desc, poolUnique); + ret = manager.addPool(desc, std::move(poolUnique)); ASSERT_EQ(ret, UR_RESULT_SUCCESS); // Inserting an existing key should return an error - ret = manager.addPool(desc, poolUnique); + ret = manager.addPool(desc, createMockPoolHandle()); ASSERT_EQ(ret, UR_RESULT_ERROR_INVALID_ARGUMENT); } diff --git a/third_party/deps.yml b/third_party/deps.yml index 80c2fa109c..69e2222bcf 100644 --- a/third_party/deps.yml +++ b/third_party/deps.yml @@ -25,7 +25,7 @@ dependencies: - libssh2=1.11.0 - libstdcxx-ng=13.1.0 - libuv=1.44.2 - - libzlib=1.2.13 + - libzlib=1.3.1 - llvm-spirv=14.0.0 - llvm-tools=14.0.6 - llvmdev=14.0.6 @@ -35,5 +35,5 @@ dependencies: - rhash=1.4.3 # don't upgrade xz utils due to CVE-2024-3094 - xz=5.2.6 - - zlib=1.2.13 + - zlib=1.3.1 - zstd=1.5.2 diff --git a/third_party/requirements.txt b/third_party/requirements.txt index 
330cbd023b..3505fa52ce 100644 --- a/third_party/requirements.txt +++ b/third_party/requirements.txt @@ -4,7 +4,7 @@ bandit==1.6.2 beautifulsoup4==4.11.1 breathe==4.33.1 bs4==0.0.1 -certifi==2023.07.22 +certifi==2024.07.04 chardet==3.0.4 clang-format==15.0.7 colorama==0.4.1 @@ -22,8 +22,8 @@ pyparsing==2.4.5 pytest>=7.0 pytz==2019.3 PyYAML==6.0.1 -requests==2.31.0 -rst2pdf==0.98 +requests==2.32.2 +rst2pdf==0.102 six==1.13.0 snowballstemmer==2.0.0 soupsieve==1.9.5 @@ -37,5 +37,5 @@ sphinxcontrib-qthelp==1.0.3 sphinxcontrib-serializinghtml==1.1.5 sphinxcontrib-websupport==1.2.4 sphinx-rtd-theme==1.0.0 -urllib3==2.1.0 +urllib3==2.2.2 dataclasses-json==0.6.7 diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp index f483dd3983..e4e0cdb696 100644 --- a/tools/urinfo/urinfo.hpp +++ b/tools/urinfo/urinfo.hpp @@ -334,8 +334,11 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, printDeviceInfo(hDevice, UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP); std::cout << prefix; - printDeviceInfo( - hDevice, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP); + printDeviceInfo( + hDevice, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP); + std::cout << prefix; + printDeviceInfo(hDevice, + UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP); std::cout << prefix; printDeviceInfo(hDevice, UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP); std::cout << prefix; @@ -372,10 +375,10 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, hDevice, UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP); std::cout << prefix; printDeviceInfo( - hDevice, UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP); + hDevice, UR_DEVICE_INFO_EXTERNAL_MEMORY_IMPORT_SUPPORT_EXP); std::cout << prefix; printDeviceInfo( - hDevice, UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP); + hDevice, UR_DEVICE_INFO_EXTERNAL_SEMAPHORE_IMPORT_SUPPORT_EXP); std::cout << prefix; printDeviceInfo(hDevice, UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP); std::cout << prefix; diff --git a/tools/urtrace/CMakeLists.txt 
b/tools/urtrace/CMakeLists.txt index 085f361223..9b385606ea 100644 --- a/tools/urtrace/CMakeLists.txt +++ b/tools/urtrace/CMakeLists.txt @@ -17,9 +17,9 @@ target_link_libraries(${TARGET_NAME} PRIVATE ${TARGET_XPTI} ${PROJECT_NAME}::com target_include_directories(${TARGET_NAME} PRIVATE ${xpti_SOURCE_DIR}/include) if(MSVC) - target_compile_definitions(${TARGET_NAME} PRIVATE - XPTI_STATIC_LIBRARY XPTI_CALLBACK_API_EXPORTS) + target_compile_definitions(${TARGET_NAME} PRIVATE XPTI_STATIC_LIBRARY) endif() +target_compile_definitions(${TARGET_NAME} PRIVATE XPTI_CALLBACK_API_EXPORTS) set(UR_TRACE_CLI_BIN ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/urtrace) diff --git a/tools/urtrace/collector.cpp b/tools/urtrace/collector.cpp index 766e7c9dfe..eb8c18d164 100644 --- a/tools/urtrace/collector.cpp +++ b/tools/urtrace/collector.cpp @@ -36,7 +36,7 @@ constexpr uint16_t TRACE_FN_BEGIN = static_cast(xpti::trace_point_type_t::function_with_args_begin); constexpr uint16_t TRACE_FN_END = static_cast(xpti::trace_point_type_t::function_with_args_end); -constexpr std::string_view UR_STREAM_NAME = "ur"; +constexpr std::string_view UR_STREAM_NAME = "ur.call"; static logger::Logger out = logger::create_logger("collector", true);