Skip to content

Commit

Permalink
Add sycl equivalent to cuda events for profiling
Browse files Browse the repository at this point in the history
  • Loading branch information
aacostadiaz committed May 21, 2024
1 parent 856985b commit 3d84a8e
Show file tree
Hide file tree
Showing 10 changed files with 207 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,13 @@ run(Gemm_Op gemm_op)

void test_gemm(int m, int n, int k)
{
sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

std::cout << "M = " << m << std::endl;
std::cout << "N = " << n << std::endl;
Expand Down
8 changes: 8 additions & 0 deletions examples/cute/tutorial/sgemm_1_sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,14 @@ int main(int argc, char** argv) {
char transB = 'T';
if (argc >= 6) sscanf(argv[5], "%c", &transB);

sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

using TA = float;
using TB = float;
using TC = float;
Expand Down
8 changes: 8 additions & 0 deletions examples/cute/tutorial/sgemm_2_sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,14 @@ int main(int argc, char** argv) {
char transB = 'T';
if (argc >= 6) sscanf(argv[5], "%c", &transB);

sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

using TA = float;
using TB = float;
using TC = float;
Expand Down
8 changes: 8 additions & 0 deletions examples/cute/tutorial/sgemm_sm70_sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,14 @@ int main(int argc, char** argv) {
char transB = 'T';
if (argc >= 6) sscanf(argv[5], "%c", &transB);

sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

using TA = float;
using TB = float;
using TC = float;
Expand Down
8 changes: 8 additions & 0 deletions examples/cute/tutorial/sgemm_sm80_sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,14 @@ int main(int argc, char** argv) {
char transB = 'T';
if (argc >= 6) sscanf(argv[5], "%c", &transB);

sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

using TA = float;
using TB = float;
using TC = float;
Expand Down
10 changes: 10 additions & 0 deletions examples/sycl/common/example_runner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,16 @@ struct ExampleRunner {
auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
auto [M, N, K, L] = problem_shape_MNKL;

#if defined(CUTLASS_ENABLE_SYCL)
sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);
#endif

stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L));
stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
Expand Down
13 changes: 0 additions & 13 deletions examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,19 +29,6 @@
*
**************************************************************************************************/

#include "cutlass/gemm/device/gemm.h"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/gemm/device/gemm_universal.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/collective/collective_mma.hpp"
#include "cutlass/util/GPU_Clock.hpp"

#include <cute/tensor.hpp>

#include "cutlass/util/device_memory.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/reference/device/gemm_complex.h"

#include "../common/example_runner.hpp"

using namespace cute;
Expand Down
9 changes: 7 additions & 2 deletions include/cutlass/gemm/device/gemm_universal_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@
// 3.x
#include "cutlass/gemm/kernel/gemm_universal.hpp"

#if defined(CUTLASS_ENABLE_SYCL)
#include "cutlass/util/event_manager.hpp"
#endif

////////////////////////////////////////////////////////////////////////////////

namespace cutlass::gemm::device {
Expand Down Expand Up @@ -403,10 +407,11 @@ class GemmUniversalAdapter<
const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);

#if defined (SYCL_INTEL_TARGET)
syclcompat::experimental::launch<device_kernel<GemmKernel>, DispatchPolicy::SubgroupSize>(sycl_grid, sycl_block, smem_size, params);
auto event = syclcompat::experimental::launch<device_kernel<GemmKernel>, DispatchPolicy::SubgroupSize>(sycl_grid, sycl_block, smem_size, params);
#else
syclcompat::launch<device_kernel<GemmKernel>>(sycl_grid, sycl_block, smem_size, params);
auto event = syclcompat::launch<device_kernel<GemmKernel>>(sycl_grid, sycl_block, smem_size, params);
#endif
EventManager::getInstance().addEvent(event);
#else
device_kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params);
#endif
Expand Down
33 changes: 18 additions & 15 deletions tools/util/include/cutlass/util/GPU_Clock.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,42 +32,49 @@
#pragma once

#if defined(CUTLASS_ENABLE_SYCL)
#include <syclcompat.hpp>
#include <chrono>
#include "cutlass/util/event_manager.hpp"
#else
#include <cuda_runtime.h>
#endif

struct GPU_Clock
{
#if !defined(CUTLASS_ENABLE_SYCL)
GPU_Clock() {
#if defined(CUTLASS_ENABLE_SYCL)
start_ = SyclEvent{};
stop_ = SyclEvent{};
#else
cudaEventCreate(&start_);
cudaEventCreate(&stop_);
cudaEventRecord(start_);
#endif
}

~GPU_Clock() {
#if defined(CUTLASS_ENABLE_SYCL)
syclEventDestroy(start_);
syclEventDestroy(stop_);
#else
cudaEventDestroy(start_);
cudaEventDestroy(stop_);
}
#endif
}

void start() {
#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::get_default_queue().wait();
start_ = std::chrono::high_resolution_clock::now();
syclEventRecord(start_);
#else
cudaEventRecord(start_);
#endif
}

float milliseconds() {
#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::get_default_queue().wait();
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<float, std::milli> time = stop - start_;
return time.count();
syclEventRecord(stop_);
syclEventSynchronize(start_, stop_);
float time;
syclEventElapsedTime(&time, start_, stop_);
return time;
#else
cudaEventRecord(stop_);
cudaEventSynchronize(stop_);
Expand All @@ -83,11 +90,7 @@ struct GPU_Clock

private:
#if defined(CUTLASS_ENABLE_SYCL)
typedef std::chrono::nanoseconds duration;
typedef std::chrono::high_resolution_clock high_resolution_clock;
typedef std::chrono::time_point<high_resolution_clock, duration> time_point;

time_point start_ = std::chrono::high_resolution_clock::now();
SyclEvent start_, stop_;
#else
cudaEvent_t start_, stop_;
#endif
Expand Down
133 changes: 133 additions & 0 deletions tools/util/include/cutlass/util/event_manager.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once

#include <vector>
#include <sycl/sycl.hpp>

class SyclEvent {
private:
int index;

public:
SyclEvent() : index(-1) {
};

int getIndex() const {
return index;
}

SyclEvent& operator=(int const& value) {
index = value;
return *this;
};
};

class EventManager {
public:
static EventManager& getInstance()
{
static EventManager instance;
return instance;
}
private:
EventManager() {}
std::vector<sycl::event> events{};
int recorders = 0;

public:
EventManager(EventManager const&) = delete;
void operator=(EventManager const&) = delete;

void startRecording(SyclEvent &event) {
if (event.getIndex() != -1) {
throw std::runtime_error("Event is already being recorded.");
}
recorders++;
event = static_cast<int>(events.size());
}

void addEvent(const sycl::event &event) {
events.push_back(event);
}

void eventDestroy() {
recorders--;
if (!recorders) {
events.clear();
}
}

float getEventElapsedTimeMs(SyclEvent const& begin, SyclEvent const& end) {
if (begin.getIndex() < 0 || begin.getIndex() > end.getIndex() || end.getIndex() > events.size()) {
throw std::runtime_error("Index out of bounds");
}

auto time_event = 0.0;
for (int i = begin.getIndex(); i < end.getIndex(); ++i) {
auto start_time = events[i].template get_profiling_info<
sycl::info::event_profiling::command_start>();

auto end_time = events[i].template get_profiling_info<
sycl::info::event_profiling::command_end>();

time_event += static_cast<float>(end_time - start_time);
}
return time_event * 1e-6;
}

void wait(SyclEvent const& begin, SyclEvent const& end) {
if (begin.getIndex() < 0 || begin.getIndex() > end.getIndex() || end.getIndex() > events.size()) {
throw std::runtime_error("Index out of bounds");
}

for (int i = begin.getIndex(); i < end.getIndex(); ++i) {
events[i].wait();
}
}

};

void syclEventDestroy(SyclEvent const& event) {
EventManager::getInstance().eventDestroy();
}

void syclEventRecord(SyclEvent &event) {
EventManager::getInstance().startRecording(event);
}

void syclEventSynchronize(SyclEvent const& begin, SyclEvent const& end) {
EventManager::getInstance().wait(begin, end);
}

void syclEventElapsedTime(float* time, SyclEvent const& begin, SyclEvent const& end) {
*time = EventManager::getInstance().getEventElapsedTimeMs(begin, end);
}

0 comments on commit 3d84a8e

Please sign in to comment.