From 49a986ab69051ab20e4c0c730752d867dc170376 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 14 Aug 2024 12:43:01 +0100 Subject: [PATCH 01/30] WIP: Introduce StreamK for PVC --- examples/sycl/pvc/pvc_gemm.cpp | 3 +- .../gemm/device/gemm_universal_adapter.h | 7 +- .../cutlass/gemm/kernel/gemm_universal.hpp | 1 + .../cutlass/gemm/kernel/intel_pvc_gemm.hpp | 3 +- .../gemm/kernel/intel_pvc_gemm_streamk.hpp | 545 +++++++++++ ...rsistent_tile_scheduler_params_streamk.hpp | 770 ++++++++++++++++ .../intel_pvc_tile_scheduler_streamk.hpp | 857 ++++++++++++++++++ .../cutlass/gemm/kernel/tile_scheduler.hpp | 16 + include/cutlass/workspace.h | 4 +- 9 files changed, 2200 insertions(+), 6 deletions(-) create mode 100644 include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp create mode 100644 include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp create mode 100644 include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp diff --git a/examples/sycl/pvc/pvc_gemm.cpp b/examples/sycl/pvc/pvc_gemm.cpp index 7959f7c0f4..79678e71b9 100644 --- a/examples/sycl/pvc/pvc_gemm.cpp +++ b/examples/sycl/pvc/pvc_gemm.cpp @@ -386,7 +386,8 @@ int main(int argc, const char** argv) using GemmKernel = cutlass::gemm::kernel::GemmUniversal< Shape, CollectiveMainloop, - CollectiveEpilogue + CollectiveEpilogue, + cutlass::gemm::StreamKScheduler >; using Gemm = cutlass::gemm::device::GemmUniversalAdapter; diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h index 40a21b1078..e1bf6ee2d6 100644 --- a/include/cutlass/gemm/device/gemm_universal_adapter.h +++ b/include/cutlass/gemm/device/gemm_universal_adapter.h @@ -91,7 +91,8 @@ class GemmUniversalAdapter; template class GemmUniversalAdapter< GemmKernel_, - cute::enable_if_t::value>> + // cute::enable_if_t::value>> + cute::enable_if_t> { public: using GemmKernel = GemmKernel_; @@ -505,7 +506,7 @@ class GemmUniversalAdapter< ////////////////////////////// CUTLASS 2.x API ///////////////////////////////// //////////////////////////////////////////////////////////////////////////////// -template +/*template class GemmUniversalAdapter< GemmKernel_, cute::enable_if_t::value>> @@ -666,7 +667,7 @@ class GemmUniversalAdapter< return status; } -}; +};*/ //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/gemm_universal.hpp b/include/cutlass/gemm/kernel/gemm_universal.hpp index bdfefa91cc..3b601005d4 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.hpp +++ b/include/cutlass/gemm/kernel/gemm_universal.hpp @@ -65,5 +65,6 @@ struct IsCutlass3ArrayKernel>> + cute::enable_if_t + && cute::is_same_v>> { public: // diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp new file mode 100644 index 0000000000..0e128bb8a5 --- /dev/null +++ b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp @@ -0,0 +1,545 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/workspace.h" +#include "cutlass/kernel_hardware_info.hpp" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/kernel/tile_scheduler.hpp" +#include "cute/tensor.hpp" + +/////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::kernel { + +/////////////////////////////////////////////////////////////////////////////// + +template < + class ProblemShape_, + class CollectiveMainloop_, + class CollectiveEpilogue_, + class TileScheduler_ +> +class GemmUniversal< + ProblemShape_, + CollectiveMainloop_, + CollectiveEpilogue_, + TileScheduler_, + cute::enable_if_t + && cute::is_same_v>> +{ +public: + // + // Type Aliases + // + using ProblemShape = ProblemShape_; + static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4, + "ProblemShape{} should be or "); + + // Mainloop derived types + using CollectiveMainloop = CollectiveMainloop_; + using TileShape = typename CollectiveMainloop::WorkgroupTileShape; + using WorkgroupTileShape = TileShape; + using TiledMma = typename CollectiveMainloop::TiledMma; + using ArchTag = typename CollectiveMainloop::ArchTag; + using ElementA = typename CollectiveMainloop::ElementA; + using StrideA = typename CollectiveMainloop::StrideA; + using ElementB = typename CollectiveMainloop::ElementB; + using StrideB = typename CollectiveMainloop::StrideB; + using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy; + using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator; + using ClusterShape = typename DispatchPolicy::ClusterShape; + using MainloopArguments = typename CollectiveMainloop::Arguments; + using MainloopParams = typename CollectiveMainloop::Params; + + // Epilogue derived types + using CollectiveEpilogue = CollectiveEpilogue_; + using ElementC = typename CollectiveEpilogue::ElementC; + using StrideC = typename CollectiveEpilogue::StrideC; + using ElementD = typename CollectiveEpilogue::ElementD; + using StrideD = typename CollectiveEpilogue::StrideD; + using EpilogueArguments = typename CollectiveEpilogue::Arguments; + using EpilogueParams = typename 
CollectiveEpilogue::Params; + +// static_assert(ArchTag::kMinComputeCapability >= 90); + + using TileSchedulerTag = TileScheduler_; + using TileScheduler = typename detail::TileSchedulerSelector< + TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler; + using TileSchedulerArguments = typename TileScheduler::Arguments; + using TileSchedulerParams = typename TileScheduler::Params; + + static constexpr uint32_t MaxThreadsPerBlock = CUTE_STATIC_V(size(TiledMma{})); + static constexpr uint32_t MinBlocksPerMultiprocessor = 1; + + /// Register requirement for Load and Math WGs +// static constexpr uint32_t LoadRegisterRequirement = 40; +// static constexpr uint32_t MmaRegisterRequirement = 232; + + // 1 stage ordered sequence between mainloop and epilogue producer load threads +// using LoadWarpOrderBarrier = cutlass::OrderedSequenceBarrier<1,2>; + + // Kernel level shared memory storage + struct SharedStorage { + using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage; + EpilogueTensorStorage epilogue; + }; + + static constexpr int SharedStorageSize = sizeof(SharedStorage); + + // Device side arguments + struct Arguments { + GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + MainloopArguments mainloop{}; + EpilogueArguments epilogue{}; + KernelHardwareInfo hw_info{}; + TileSchedulerArguments scheduler{}; + }; + + // Kernel entry point API + struct Params { + GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + MainloopParams mainloop{}; + EpilogueParams epilogue{}; + KernelHardwareInfo hw_info{}; + TileSchedulerParams scheduler{}; + void* workspace{nullptr}; + }; + + // + // Methods + // + + // Convert to underlying arguments. In this case, a simple copy for the aliased type. + static + Params + to_underlying_arguments(Arguments const& args, void* workspace) { + CUTLASS_TRACE_HOST("to_underlying_arguments():"); + + auto problem_shape = args.problem_shape; + + auto problem_shape_MNKL = append<4>(problem_shape, 1); + + // Get SM count if needed, otherwise use user supplied SM count + int sm_count = args.hw_info.sm_count; + if (sm_count <= 0) { + CUTLASS_TRACE_HOST(" WARNING: Arguments do not include a valid SM count.\n" + " For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count."); + sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id); + } + + CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count); + + KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count}; + + // Calculate workspace pointers + uint8_t* workspace_ptr = reinterpret_cast(workspace); + size_t workspace_offset = 0; + + void* scheduler_workspace = workspace_ptr; + workspace_offset += TileScheduler::template get_workspace_size( + args.scheduler, args.problem_shape, args.hw_info); + + void* epilogue_workspace = workspace_ptr + workspace_offset; + workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue); + + void* mainloop_workspace = nullptr; + // Precompute the sub tiles numbers in epilogue, pass into tile scheduler. Therefore it will be used + // in separate reduction scheme for streamk case, NumEpilogueSubTiles default value is 1, which means + // subtile will not be used, therefore separate reduction will not be enabled. 
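+    // Layout sketch of the workspace carved up above (illustrative, not a new contract):
+    //   [ scheduler workspace | epilogue workspace ]
+    // where the scheduler portion itself is laid out by the PVC stream-K scheduler params as
+    //   [ reduction accumulators | reduction barriers ]
+    // and its initialize_workspace zeroes only the barrier region.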
+ // constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{}); + TileSchedulerParams scheduler = TileScheduler::to_underlying_arguments( + problem_shape_MNKL, TileShape{}, hw_info, args.scheduler, scheduler_workspace); + + return { + args.mode, + problem_shape, + CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, mainloop_workspace), + CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, epilogue_workspace), + hw_info, + scheduler, + workspace + }; + } + + static bool + can_implement(Arguments const& args) { + bool mode_implementable = args.mode == GemmUniversalMode::kGemm or + (args.mode == GemmUniversalMode::kBatched && rank(ProblemShape{}) == 4); + return mode_implementable && TileScheduler::can_implement(args.scheduler); + } + + static size_t + get_workspace_size(Arguments const& args) { + size_t workspace_size = 0; + + workspace_size += TileScheduler::template get_workspace_size( + args.scheduler, args.problem_shape, args.hw_info); + workspace_size += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue); + return workspace_size; + } + + static cutlass::Status + initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + Status status = Status::kSuccess; + uint8_t* workspace_ptr = reinterpret_cast(workspace); + size_t workspace_offset = 0; + + status = TileScheduler::template initialize_workspace( + args.scheduler, workspace_ptr + workspace_offset, args.problem_shape, args.hw_info); + workspace_offset += TileScheduler::template get_workspace_size( + args.scheduler, args.problem_shape, args.hw_info); + if (status != Status::kSuccess) { + return status; + } + + status = CollectiveEpilogue::initialize_workspace(args.problem_shape, args.epilogue, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue); + if (status != Status::kSuccess) { + return status; + } + + return status; + } + + // Computes the kernel launch grid shape based on runtime parameters + static dim3 + get_grid_shape(Params const& params) { + // Given device SM count, set grid size s.t. we do not launch more thread blocks than we can run concurrently + // TileSchedulerArguments args{}; + return TileScheduler::get_grid_shape(params.problem_shape, TileShape{}, params.hw_info); + } + + static dim3 + get_block_shape() { + return dim3(MaxThreadsPerBlock, 1, 1); + } + + CUTLASS_DEVICE + void + operator()(Params const& params, char* smem_buf) { + printf("I am here\n"); + + +/* using namespace cute; + using X = Underscore; + + // Preconditions + static_assert(size(TiledMma{}) == 256, "Cooperative kernel must have TiledMMA operating using 256 threads."); + static_assert(size<0>(TileShape{}) >= 128, + "Cooperative kernel requires Tile Size to be greater than or equal to 128 along the M-dimension."); + + static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. 
If batch mode is not needed, set L stride to Int<0>."); + + // Kernel level shared memory storage + SharedStorage& shared_storage = *reinterpret_cast(smem_buf); + + // Mainloop Load pipeline + using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline; + typename MainloopPipeline::Params mainloop_pipeline_params; + if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Mainloop) { + mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer; + } + if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) { + mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer; + } + mainloop_pipeline_params.is_leader = warp_group_thread_idx == 0; + mainloop_pipeline_params.num_consumers = size(TiledMma{}); + mainloop_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes; + MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params, ClusterShape{}); + + // Epilogue Load pipeline + using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline; + typename EpiLoadPipeline::Params epi_load_pipeline_params; + if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Epilogue) { + epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer; + } + if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) { + epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer; + } + epi_load_pipeline_params.dst_blockid = cute::block_rank_in_cluster(); + epi_load_pipeline_params.producer_arv_count = NumThreadsPerWarp; + epi_load_pipeline_params.consumer_arv_count = size(TiledMma{}); + if constexpr (CollectiveEpilogue::RequiresTransactionBytes) { + epi_load_pipeline_params.transaction_bytes = params.epilogue.tma_transaction_bytes; + } + EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params); + + // Epilogue Store pipeline + using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline; + typename EpiStorePipeline::Params epi_store_pipeline_params; + epi_store_pipeline_params.always_wait = true; + EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params); + + typename LoadWarpOrderBarrier::Params params_load_order_barrier; + params_load_order_barrier.group_id = producer_warp_role == ProducerWarpRole::Mainloop ? 
0 : 1; + params_load_order_barrier.group_size = NumThreadsPerWarp; + LoadWarpOrderBarrier load_order_barrier(shared_storage.pipelines.load_order, params_load_order_barrier); + + // Initialize starting pipeline states for the collectives + // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding) + typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state; + typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state; + + // For the DMA Load (producer) we start with an opposite phase + // i.e., we skip all waits since we know that the buffer is indeed empty + PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state(); + PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state(); + PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state(); + + auto cluster_wait_fn = [] () { + // We need this to guarantee that the Pipeline init is visible + // To all producers and consumer thread blocks in the Cluster + if constexpr (size(ClusterShape{}) > 1) { + cute::cluster_arrive_relaxed(); + return [] () { cute::cluster_wait(); }; + } + else { + syncthreads(); + return [] () {}; // do nothing + } + } (); + + // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); + + // Get the appropriate blocks for this thread block -- potential for thread block locality + TiledMma tiled_mma; + auto blk_shape = TileShape{}; // (BLK_M,BLK_N,BLK_K) + + TileScheduler scheduler{params.scheduler}; + auto work_tile_info = scheduler.initial_work_tile_info(ClusterShape{}); + + // In a warp specialized kernel, collectives expose data movement and compute operations separately + CollectiveMainloop collective_mainloop; + + // Prepare and partition the input tensors. Expects a tuple of tensors where: + // get<0>(load_inputs) is the tma tensor A after local tiling so that it has shape (BLK_M,BLK_K,m,k,l) + // get<1>(load_inputs) is the tma tensor B after local tiling so that it has shape (BLK_N,BLK_K,n,k,l) + auto load_inputs = collective_mainloop.load_init(problem_shape_MNKL, params.mainloop); + static_assert(cute::tuple_size_v >= 2, "Output of load_init must have at least two elements (A, B)"); + + // Extract out partitioned A and B. + Tensor gA_mkl = get<0>(load_inputs); + Tensor gB_nkl = get<1>(load_inputs); + + // Wait for all thread blocks in the Cluster + cluster_wait_fn(); + + if (warp_group_role == WarpGroupRole::Producer) { + cutlass::arch::warpgroup_reg_dealloc(); + + CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue); + + // Mainloop Producer Warp + if (producer_warp_role == ProducerWarpRole::Mainloop) { + bool do_load_order_arrive = true; + while (work_tile_info.is_valid()) { + if (!TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) { + work_tile_info = scheduler.fetch_next_work(work_tile_info); + continue; + } + + // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape + auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl)); + auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl)); + auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); + auto blk_coord = make_coord(m_coord, n_coord, _, l_coord); + + // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work. 
+ auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape); + auto work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info); + auto k_tile_iter = cute::make_coord_iterator(idx2crd(work_k_tile_start, shape<3>(gA_mkl)), shape<3>(gA_mkl)); + + collective_mainloop.load( + params.mainloop, + mainloop_pipeline, + mainloop_pipe_producer_state, + load_inputs, + blk_coord, + k_tile_iter, work_k_tile_count, + lane_idx, + block_rank_in_cluster, + shared_storage.tensors.mainloop + ); + // Update starting pipeline state for the next tile + mainloop_pipe_producer_state.advance(work_k_tile_count); + + // Signal for the epilogue load warp to begin + if (do_load_order_arrive) { + load_order_barrier.arrive(); + do_load_order_arrive = false; + } + + // Get next work tile + work_tile_info = scheduler.fetch_next_work(work_tile_info); + } // Scheduler work fetch loop + + // Make sure all Consumer Warp Groups have been waited upon + collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state); + + } // Mainloop Producer Warp End + + // Epilogue Producer Warp + else if (producer_warp_role == ProducerWarpRole::Epilogue && collective_epilogue.is_producer_load_needed()) { + + if (!TileScheduler::requires_separate_reduction(params.scheduler) && work_tile_info.is_valid()) { + load_order_barrier.wait(); + } + while (work_tile_info.is_valid()) { + if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) { + // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape + auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl)); + auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl)); + auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); + auto blk_coord = make_coord(m_coord, n_coord, _, l_coord); + + epi_load_pipe_producer_state = + collective_epilogue.load( + epi_load_pipeline, + epi_load_pipe_producer_state, + problem_shape_MNKL, + blk_shape, + blk_coord, + tiled_mma, + lane_idx, + shared_storage.tensors.epilogue, + work_tile_info.reduction_subtile_idx() + ); + } + + // Get next work tile + work_tile_info = scheduler.fetch_next_work(work_tile_info); + } // Scheduler work fetch loop + + // Make sure all Consumer Warp Groups have been waited upon + collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state); + } // Epilogue Producer Warp End + } // Producer Warp Group End + + else if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) { + cutlass::arch::warpgroup_reg_alloc(); + + CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue); + + // Do we potentially issue tail arrives for TMA stores, if epilogue load is waiting for it + bool do_store_tail = false; + while (work_tile_info.is_valid()) { + // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape + auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl)); + auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl)); + auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); + auto blk_coord = make_coord(m_coord, n_coord, _, l_coord); + auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape); + + // Allocate the accumulators for the (M,N) blk_shape + // + // MSVC CTAD breaks if we say "Tensor" here, so we use "auto" instead. 
+ auto accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N) + if(TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) { + collective_mainloop.mma( + mainloop_pipeline, + mainloop_pipe_consumer_state, + accumulators, + work_k_tile_count, + mma_thread_idx, + shared_storage.tensors.mainloop, + params.mainloop + ); + + // Make sure the math instructions are done and free buffers before entering the epilogue + collective_mainloop.mma_tail( + mainloop_pipeline, + mainloop_pipe_consumer_state, + work_k_tile_count + ); + + // Update starting mainloop pipeline state for the next tile + mainloop_pipe_consumer_state.advance(work_k_tile_count); + } + // Index of warp group within consumer warp groups + int consumer_warp_group_idx = canonical_warp_group_idx() - NumLoadWarpGroups; + + // Perform reduction across splits, if needed + TileScheduler::fixup( + params.scheduler, work_tile_info, accumulators, NumMmaWarpGroups, consumer_warp_group_idx); + + if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) { + // Epilogue and write to gD + auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] = + collective_epilogue.store( + epi_load_pipeline, + epi_load_pipe_consumer_state, + epi_store_pipeline, + epi_store_pipe_producer_state, + problem_shape_MNKL, + blk_shape, + blk_coord, + accumulators, + tiled_mma, + mma_thread_idx, + shared_storage.tensors.epilogue, + work_tile_info.reduction_subtile_idx() + ); + epi_load_pipe_consumer_state = epi_load_pipe_consumer_state_next; + epi_store_pipe_producer_state = epi_store_pipe_producer_state_next; + do_store_tail = true; + } + + // Get next work tile + work_tile_info = scheduler.fetch_next_work(work_tile_info); + } // Scheduler work fetch loop + + if (do_store_tail) { + collective_epilogue.store_tail( + epi_load_pipeline, + epi_load_pipe_consumer_state, + epi_store_pipeline, + epi_store_pipe_producer_state + ); + } + } // Consumer Warp Groups End + */ + } + +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::kernel diff --git a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp new file mode 100644 index 0000000000..96e1368b2b --- /dev/null +++ b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp @@ -0,0 +1,770 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +/*! \file + \brief Parameters structures for persistent tile schedulers +*/ + +#include "cutlass/coord.h" +#include "cutlass/kernel_hardware_info.h" +#include "cutlass/workspace.h" +#include "cutlass/platform/platform.h" +#include "cutlass/fast_math.h" +#include "cutlass/gemm_coord.h" +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { +namespace detail { + +//////////////////////////////////////////////////////////////////////////////// +// Parameters for Intel PVC persistent stream-K scheduler +struct PersistentTileSchedulerIntelPVCStreamKParams { + + // Strategies for computing reductions between CTAs computing portions of a given output tile + enum class ReductionMode { + // Participating CTAs perform reduction in a turnstile fashion in order of the K extent + // covered by each CTA. This requires a lock to be held exclusively be the CTA that is + // currently accumulating. + // + // Turnstile accumulation ensures deterministic numeric behavior when using this mode. + Deterministic, + + // Participating CTAs perform reduction atomically to the same workspace (mostly) without locking. + // Locks are used only to wait for the first CTA to write its partial values (to initialize the + // workspace), and for all but the final CTA to have accumulated (so that the final CTA can load + // the accumulated value and accumulate it into registers on top of which the epilogue will + // be performed). + // + // Due to the nondeterminsitic ordering of accumulation, deterministic numeric behavior cannot + // be guaranteed with this mode (e.g., floating-point rounding error will depend on the order + // of accumulation) + Nondeterministic + }; + + // Strategies for decomposing the problem + enum class DecompositionMode { + // Use a heuristic to determine whether data-parallel, split-K, or stream-K decomposition should be performed + Heuristic, + // Force a data-parallel decomposition + DataParallel, + // Force a split-K decomposition. This should be paired with setting the `splits` parameter + SplitK, + // Force a stream-K decomposition + StreamK + }; + + FastDivmodU64 divmod_batch_{}; + + // We divide up the number of stream-K tiles amongst G groups of stream-K units. + // The stream-K units within a group collaborate to comptue over the `sk_tiles / G` + // tiles assigned to that group. 
Non-unit group sizes can help to preserve L2 locality of + // partial chunks computed by stream-K units -- units 0 in each group will compute identical K extents + // of tiles that would be assigned in the same wave according to the rasterization order of the + // data-parallel formulation of the problem. + FastDivmodU64 divmod_sk_groups_{}; + + // Number of stream-K units in each group + FastDivmodU64 divmod_sk_units_per_group_{}; + + uint64_t units_per_problem_ = 0; + FastDivmod divmod_tiles_per_output_tile_{}; + + // The splitting factor to be used in a split-K decomposition of the problem. + // If this is set to a value greater than 1, stream-K decomposition logic + // is bypassed in favor of a split-K decomposition. + FastDivmod divmod_splits_{}; + + // Number of stream-K or split-K work units that compute an extra k iteration. + // This is done to handle residuals in dividing up the k iteration space. + // For stream-K, since the actual assignment of work to stream-K units will be done + // at the granularity of a cluster, we store only the number of big clusters. + uint32_t big_units_ = 0; + + // The number of groups of stream-K units that will process an extra stream-K tile cluster. + uint32_t big_groups_ = 0; + + // Workspace for holding partial accumulators to be reduced across stream-K/split-K units + void* reduction_workspace_ = nullptr; + + // Number of tiles covered by stream-K work units + uint32_t sk_tiles_ = 0; + + // Number of work units computing stream-K tiles + uint32_t sk_units_ = 0; + + // Number of tiled k iterations computed by each stream-K work unit. This + // can potentially cover more than one output tile. + FastDivmod divmod_k_tiles_per_sk_unit_{}; + // Number of tiled k iterations computed by each "big" stream-K units, which + // processes one more K chunk than a "normal" stream-K unit. + FastDivmod divmod_k_tiles_per_sk_big_unit_{}; + + // Strategy to use when reducing between collaborating CTAs + ReductionMode reduction_mode_ = ReductionMode::Deterministic; + + // Minimum number of k tiles that can be assigned to a stream-K unit + static constexpr uint32_t min_iters_per_sk_unit_ = 8u; + + // Maximum number of groups of stream-K units + static constexpr uint32_t max_sk_groups_ = 8u; + + // ktile start from even for each cta + uint32_t ktile_start_alignment_count { 1u }; + + // Returns the maximum number of peers that can collaborate on a given output tile + CUTLASS_HOST_DEVICE + static uint32_t + max_peers_per_tile(uint64_t sk_units, uint64_t sk_tiles) { + // When we can divide up our SK units to SK tiles evenly, the number of peers + // per SK tile is exactly (sk_units_ / sk_tiles_). In cases where this division + // is not exact, some tiles will need to be covered by additional SK units. Because + // the extra work can occur at both the beginning and the end of the SK tile, at + // most 2 extra peers will be needed. + return static_cast(sk_units / sk_tiles + 2); + } + + // Initializes members. This variant of the method should only be used when + // problem_shape and tile_shape contain modes of only rank 1. 
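+  // (For modes of rank greater than 1, use the overload below that takes the pre-computed
+  // problem_blocks instead.)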
+ void + initialize( + BatchedGemmCoord problem_shape, + GemmCoord tile_shape, + KernelHardwareInfo hw_info, + int splits, + ReductionMode reduction_mode, + DecompositionMode decomposition_mode, + void* workspace + ) { + + dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, tile_shape); + // Number of k tiles in each output tile + uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k(); + + initialize( + problem_blocks, + k_tiles_per_output_tile, + hw_info, + splits, + reduction_mode, + decomposition_mode, + workspace + ); + } + + // Version of initialize that takes in as input the number of CTAs in the M and N and L dimensions. + // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1, + // for which using CuTe algebra for calculating tile shapes is easiest. + void + initialize( + dim3 problem_blocks, + uint32_t k_tiles_per_output_tile, + KernelHardwareInfo hw_info, + int splits, + ReductionMode reduction_mode, + DecompositionMode decomposition_mode, + void* workspace + ) { + + auto problem_blocks_l = problem_blocks.z; + + auto problem_blocks_m = problem_blocks.x; + auto problem_blocks_n = problem_blocks.y; + uint64_t output_tiles = problem_blocks_m * problem_blocks_n * problem_blocks_l; + + // Reduction workspace is at the beginning of the workspace. Lock workspace follows. + void* reduction_workspace = workspace; + + if (decomposition_mode == DecompositionMode::SplitK || + (decomposition_mode == DecompositionMode::Heuristic && splits > 1)) { + // Short circuit to basic split-K decomposition + + // Don't split by more than the available number of SMs + if (splits > hw_info.sm_count) { + splits = hw_info.sm_count; + } + + // Don't split by more than the K tile iterations + // + // splits is almost certainly nonnegative here (e.g., hw_info.sm_count, + // despite being an int, is a count), so it can safely be converted to unsigned + // in the comparison to avoid a signed-unsigned comparison warning-as-error. + if (static_cast(splits) > k_tiles_per_output_tile) { + splits = k_tiles_per_output_tile; + } + + // If splits == k_tiles_per_output_tiles, there will be one k_tile per cta + // and this violate k_tile start from even requirements. Thus we need to + // reduce the number of splits. + if (ktile_start_alignment_count > 1u && + static_cast(splits) == k_tiles_per_output_tile) { + splits = k_tiles_per_output_tile / ktile_start_alignment_count; + } + + set_params_basic( + problem_blocks_m, + problem_blocks_n, + problem_blocks_l, + splits, + k_tiles_per_output_tile, + reduction_workspace, + reduction_mode + ); + return; + } + + // Calculate the maximum number of blocks from clusters of shape cluster_shape that we + // can fit within sm_count SMs. + dim3 grid = get_grid_shape( + problem_blocks, + hw_info + ); + + uint64_t ctas_per_wave = grid.x * grid.y; + // The number of output tiles to be computed in stream-K and data-parallel fashion, respectively. + uint32_t sk_tiles = get_num_sk_tiles( + output_tiles, + ctas_per_wave, + k_tiles_per_output_tile, + decomposition_mode + ); + uint64_t dp_tiles = output_tiles - sk_tiles; + + // Calculate the number of work units covering the data-parallel and stream-K tiles. + // A "work unit" is a single index in the linearized ID space used by the scheduler. + // We distinguish it from a "block," which is typically tied to a hardware unit + // (e.g., the callers into this scheduler will be persistent thread blocks). 
+ // A work unit can encompass multiple output tiles worth of work (as will be the + // case for stream-K blocks). + // Since splitting is not required for data-parallel tiles, only one data-parallel unit + // is needed per data-parallel tile. + uint64_t dp_units = dp_tiles; + + uint64_t ctas_per_sk_wave = ctas_per_wave; + uint64_t sk_units = get_num_sk_units(ctas_per_sk_wave, sk_tiles, k_tiles_per_output_tile); + + if (decomposition_mode == DecompositionMode::DataParallel || + (decomposition_mode == DecompositionMode::Heuristic && sk_tiles == 0) || + sk_units == 0) { + // Short circuit to basic data-parallel decomposition + set_params_basic( + problem_blocks_m, + problem_blocks_n, + problem_blocks_l, + /* splits = */ 1, + k_tiles_per_output_tile, + reduction_workspace, + reduction_mode + ); + return; + } + + bool do_separate_reduction = false; + // should_perform_separate_reduction( + // epilogue_subtile, sk_units, sk_tiles, dp_tiles, ctas_per_wave); + + // Determine the number of stream-K groups that will be used. Choosing the + // fast moving dimension of the underlying grid. + uint32_t max_groups_problem = problem_blocks_n; + + // Select the number of groups that will be use. We start with the maximum + // number of potential groups, and iterate down looking for a group size that + // evenly divides the stream-K units and tiles, and for which the resulting + // number of K tiles per stream-K unit remains above min_iters_per_sk_unit_ + + uint32_t groups = platform::min(max_groups_problem, uint32_t(max_sk_groups_)); + + // Grouping is disabled when separate reduction is used + // if (do_separate_reduction) { + // groups = 1; + // } + + uint32_t fallback_groups = 0; + + auto sk_splits_too_small = [&](uint32_t g) { + // Check whether the number of K tiles computed per stream-K unit is less + // than min_iters_per_sk_unit_ + auto total_sk_tiles = sk_tiles / g; + auto total_sk_k_tiles = total_sk_tiles * k_tiles_per_output_tile; + auto k_tiles_per_sk_unit = total_sk_k_tiles / (sk_units / g); + return k_tiles_per_sk_unit < min_iters_per_sk_unit_; + }; + + auto is_ideal_grouping = [&](uint32_t g) { + // An ideal grouping will evenly divide stream-K clusters, evenly divide + // stream-K tiles, and not result in stream-K splits that are too small. + return (sk_units % g == 0) && (sk_tiles % g == 0) && !sk_splits_too_small(g); + }; + + auto is_valid_grouping = [&](uint32_t g) { + // A grouping is valid, but not ideal, if it evenly divides the + // stream-K clusters and does not result in stream-K splits that are + // too small. Such a setting can be used as a fallback option in the + // case that an ideal grouping is not achievable + return sk_units % g == 0 && !sk_splits_too_small(g); + }; + + while (groups > 1 && !is_ideal_grouping(groups)) { + if (fallback_groups == 0 && is_valid_grouping(groups)) { + // Set fallback groups once in preference for a larger number of groups. + fallback_groups = groups; + } + --groups; + } + + // If groups == 1, we did not find a group count that satisfies all criteria. If we have + // found a fallback group count, use this instead. + if (groups == 1 && fallback_groups > 0) { + groups = fallback_groups; + } + + auto sk_units_per_group = sk_units / groups; + + // sk_tiles is guaranteed to be divisible by cluster_size because it is calculated as: + // sk_tiles = (waves <= 2) ? total_tiles : (sm_count + (total_tiles % sm_count)) + // Both total_tiles and sm_count are multiples of cluster size due to padding added + // prior to kernel launch. 
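+    // Illustrative example (hypothetical numbers): with sk_tiles = 10 and groups = 4,
+    // sk_tiles_per_group = 2 and sk_big_groups = 2, i.e. two "big" groups cover three
+    // stream-K tiles each and the remaining two groups cover two each (2*3 + 2*2 = 10).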
+ uint64_t sk_tiles_per_group = sk_tiles / groups; + + // Groups that will process an extra stream-K tile cluster. These differ from "big_units," which + // are stream-K units within a group that process an extra K chunk. + uint64_t sk_big_groups = sk_tiles % groups; + + uint64_t k_tiles_per_group = k_tiles_per_output_tile * sk_tiles_per_group; + + // Number of k tiles computed per stream-K unit + uint64_t k_tiles_per_sk_unit = k_tiles_per_group / sk_units_per_group; + + uint32_t reduction_units = 0; + + // Use separate reduction when we have less than one wave of output tiles (dp_tiles == 0) + // and when each tile will be operated on by at least two stream-K units (sk_units > 2 * sk_tiles) + // if (do_separate_reduction) { + // // Each reduction unit will reduce the partials of an epilogue subtile for + // // a given output tile and compute the epilogue. Thus, there are as many reduction + // // units as there are epilogue subtiles. + // reduction_units = sk_tiles * epilogue_subtile; + // } + // else + if (decomposition_mode == DecompositionMode::Heuristic && sk_tiles < sk_units && sk_units % sk_tiles == 0) { + // If the number of stream-K units is a multiple of the number of stream-K tiles, then + // the problem can leverage a basic split-K decomposition for the stream-K tiles. + // This case happens when separate reduction is disable. + uint32_t sk_splits = static_cast(sk_units / sk_tiles); + set_params_basic( + problem_blocks_m, + problem_blocks_n, + problem_blocks_l, + sk_splits, + k_tiles_per_output_tile, + reduction_workspace, + reduction_mode + ); + return; + } + + divmod_batch_ = FastDivmodU64(problem_blocks_m * problem_blocks_n); + divmod_tiles_per_output_tile_ = FastDivmod(k_tiles_per_output_tile); + divmod_sk_groups_ = FastDivmodU64(static_cast(groups)); + divmod_sk_units_per_group_ = FastDivmodU64(static_cast(sk_units / groups)); + + divmod_splits_ = FastDivmod(1); + units_per_problem_ = static_cast(dp_units + sk_units); + + // Assign big_units_ assuming that group count == 1. This is unused by stream-K + // when group count > 1. + big_units_ = static_cast(k_tiles_per_group % k_tiles_per_sk_unit); + + big_groups_ = static_cast(sk_big_groups); + reduction_workspace_ = reduction_workspace; + sk_tiles_ = sk_tiles; + sk_units_ = static_cast(sk_units); + divmod_k_tiles_per_sk_unit_ = FastDivmod(static_cast(k_tiles_per_sk_unit)); + divmod_k_tiles_per_sk_big_unit_ = FastDivmod(static_cast(k_tiles_per_sk_unit + 1)); + reduction_mode_ = reduction_mode; + } + + // Get the number of CTA tiles in this problem. This variant of the method should only be used when + // problem_shape and tile_shape contain modes of only rank 1. + CUTLASS_HOST_DEVICE + static dim3 + get_tiled_cta_shape_mnl(BatchedGemmCoord problem_shape, GemmCoord cta_shape) { + auto cta_m = (problem_shape.m() + cta_shape.m() - 1) / cta_shape.m(); + auto cta_n = (problem_shape.n() + cta_shape.n() - 1) / cta_shape.n(); + + return { + static_cast(cta_m), + static_cast(cta_n), + static_cast(problem_shape.batch()) + }; + } + + CUTLASS_HOST_DEVICE + static dim3 + get_grid_shape( + dim3 problem_blocks, + KernelHardwareInfo hw_info + ) { + return dim3{problem_blocks.y, problem_blocks.x, 1}; + } + + // Returns the number of stream-K tiles that will be computed amongst `output_tiles` total + // output tiles on a device with `ctas_per_wave` CTAs in each wave. 
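+  // In short: data-parallel and split-K decompositions get 0 stream-K tiles; otherwise the
+  // partial tail wave plus one full wave is treated as stream-K work, unless the heuristic
+  // keeps everything data-parallel (no wave quantization, K extent too small, or a tail wave
+  // that is already more than half full).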
+ static uint32_t + get_num_sk_tiles( + uint64_t output_tiles, + uint64_t ctas_per_wave, + uint32_t k_tiles_per_output_tile, + DecompositionMode decomposition_mode + ) { + uint32_t full_waves = static_cast(output_tiles / ctas_per_wave); + uint32_t total_waves = static_cast((output_tiles + ctas_per_wave - 1) / ctas_per_wave); + + if (decomposition_mode == DecompositionMode::DataParallel || + decomposition_mode == DecompositionMode::SplitK) { + return 0; + } + + // If there is wave quantization, assign the first two waves worth of tiles to be + // covered by stream-K work and the remainder to be data-parallel. Since we know + // that full_waves == total_waves - 1 in this case, the number of data-parallel + // waves is simply full_waves-1 (unless full_waves == 0). + uint32_t dp_waves = full_waves > 1 ? full_waves - 1 : 0; + uint64_t dp_tiles = dp_waves * ctas_per_wave; + uint64_t sk_tiles = output_tiles - dp_tiles; + + if (decomposition_mode == DecompositionMode::Heuristic) { + if (full_waves == total_waves || k_tiles_per_output_tile <= min_iters_per_sk_unit_) { + // All tiles will be data-parallel tiles if there is either no quantization + // or if there is no work to be split. + return 0; + } + + // + // The final wave is not full. Perform some stream-K work. + // + + // Rudimentary heuristic: prefer data-parallel decomposition if we have more than + // one wave and the tail wave is more than half full. This is subject to change. + uint64_t tail_tiles = output_tiles - (full_waves * ctas_per_wave); + if (2 * tail_tiles >= ctas_per_wave) { + return 0; + } + } + + return static_cast(sk_tiles); + } + + CUTLASS_HOST_DEVICE + static uint64_t + get_num_sk_units(uint64_t ctas_per_sk_wave, uint32_t sk_tiles, uint32_t k_tiles_per_output_tile) { + // If there are stream-K tiles to compute and a sufficiently large number of k iterations + // across them, they will be covered by a single wave of persistent threadblocks. Thus, there + // will be as many work units as there are threadblocks in a single wave. + // + // When the total k iterations across stream-K tiles is too small to justify distributing + // across an entire wave of blocks, we instead distribute the iterations over a smaller + // set of blocks. + + // Calculate the number of stream-K units that would be needed if each stream-K unit + // computed the minimum allowable k iterations. Truncate this to be in units of clusters. + + // Number of k iterations computed by the stream-K units as a whole + uint64_t k_tiles_sk_total = k_tiles_per_output_tile * sk_tiles; + + // Calculate the number of stream-K units that would be needed if each stream-K unit + // computed the minimum allowable k iterations. Truncate this to be in units of clusters. 
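+    // For example (hypothetical numbers): 4 stream-K tiles of 16 K tiles each give
+    // k_tiles_sk_total = 64, so at most 64 / min_iters_per_sk_unit_ = 64 / 8 = 8 stream-K
+    // units are used, even if the wave has more CTAs available.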
+ uint64_t min_sized_sk_units = (k_tiles_sk_total / min_iters_per_sk_unit_); + + uint64_t sk_units = platform::min(ctas_per_sk_wave, min_sized_sk_units); + return sk_units; + } + + // Calculates the size of the workspace needed for holding reduction barriers + CUTLASS_HOST_DEVICE + static size_t + get_barrier_workspace_size(uint64_t num_tiles, uint32_t barrier_bits) { + size_t workspace_bits = num_tiles * static_cast(barrier_bits); + return round_up_to_l2_alignment(bits_to_bytes(workspace_bits)); + } + + // Calculates the size of the workspace needed for holding partial outputs from splits + CUTLASS_HOST_DEVICE + static size_t + get_reduction_workspace_size(uint64_t num_tiles, GemmCoord tile_shape, uint32_t accumulator_bits, uint32_t num_accumulator_mtxs = 1) { + size_t output_tile_size = tile_shape.m() * tile_shape.n(); + size_t workspace_bits = accumulator_bits * output_tile_size * num_tiles * num_accumulator_mtxs; + return round_up_to_l2_alignment(bits_to_bytes(workspace_bits)); + } + + static void + get_workspace_component_sizes( + dim3 problem_blocks, + uint32_t k_tiles_per_output_tile, + GemmCoord tile_shape, + size_t& barrier_workspace_size, + size_t& reduction_workspace_size, + KernelHardwareInfo const& hw_info, + int splits, + DecompositionMode decomposition_mode, + uint32_t barrier_bits, + uint32_t accumulator_bits) { + + // Workspace is needed only for output tiles that will be split. Thus, we first determine the number + // of output tiles that will be split, and then calculate the workspace needed to cover these. + uint64_t output_tiles = problem_blocks.x * problem_blocks.y * problem_blocks.z; + + if (decomposition_mode == DecompositionMode::DataParallel) { + barrier_workspace_size = 0; + reduction_workspace_size = 0; + } + else if (splits > 1 && + (decomposition_mode == DecompositionMode::SplitK || decomposition_mode == DecompositionMode::Heuristic)) { + // Basic split-K variant requires workspace for all output tiles + barrier_workspace_size = get_barrier_workspace_size(output_tiles, barrier_bits); + reduction_workspace_size = get_reduction_workspace_size(output_tiles, tile_shape, accumulator_bits); + } + else { + KernelHardwareInfo new_hw_info; + new_hw_info.device_id = hw_info.device_id; + new_hw_info.sm_count = hw_info.sm_count; + if (new_hw_info.sm_count <= 0) { + CUTLASS_TRACE_HOST(" WARNING: Arguments do not include a valid SM count.\n" + " For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count."); + new_hw_info.sm_count = KernelHardwareInfo::query_device_multiprocessor_count(new_hw_info.device_id); + } + + dim3 grid = get_grid_shape( + problem_blocks, + new_hw_info + ); + uint64_t ctas_per_wave = grid.x * grid.y; + uint32_t sk_tiles = get_num_sk_tiles( + output_tiles, + ctas_per_wave, + static_cast(k_tiles_per_output_tile), + decomposition_mode + ); + uint64_t ctas_per_sk_wave = ctas_per_wave; + uint64_t sk_units = get_num_sk_units(ctas_per_sk_wave, sk_tiles, k_tiles_per_output_tile); + uint64_t dp_tiles = output_tiles - sk_tiles; + + uint64_t reduction_tiles = sk_tiles; + + // Though separate reduction requires a larger reduction workspace, only one barrier + // is needed per output tile. Each peer will increment the barrier by one once the peer has + // written its accumulator to scratch space. The separate reduction unit will only begin + // performing the reduction when the barrier has reached the number of peers for the output tile. 
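+      // Sizing sketch (as implemented by the helpers above): one barrier per stream-K tile,
+      // plus one accumulator-sized scratch tile per reduction tile (reduction_tiles == sk_tiles
+      // here, since separate reduction is disabled in this WIP), each rounded up to the L2
+      // cache-line alignment.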
+ barrier_workspace_size = get_barrier_workspace_size(sk_tiles, barrier_bits); + reduction_workspace_size = get_reduction_workspace_size(reduction_tiles, tile_shape, accumulator_bits); + } + } + + // Get the amount of scratch workspace needed for the kernel. This variant of the method should only be used when + // problem_shape and tile_shape contain modes of only rank 1. + static size_t + get_workspace_size( + BatchedGemmCoord problem_shape, + GemmCoord tile_shape, + KernelHardwareInfo const& hw_info, + int splits, + DecompositionMode decomposition_mode, + uint32_t barrier_bits, + uint32_t element_accumulator_bits) { + + dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, tile_shape); + uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k(); + + return get_workspace_size( + problem_blocks, + k_tiles_per_output_tile, + tile_shape, + hw_info, + splits, + decomposition_mode, + barrier_bits, + element_accumulator_bits + ); + } + + // Version of get_workspace_size that takes in as input the number of CTAs in the M and N dimensions. + // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1, + // for which using CuTe algebra for calculating tile shapes is easiest. + static size_t + get_workspace_size( + dim3 problem_blocks, + uint32_t k_tiles_per_output_tile, + GemmCoord tile_shape, + KernelHardwareInfo const& hw_info, + int splits, + DecompositionMode decomposition_mode, + uint32_t barrier_bits, + uint32_t element_accumulator_bits) { + + size_t barrier_workspace_size = 0; + size_t reduction_workspace_size = 0; + + get_workspace_component_sizes( + problem_blocks, + k_tiles_per_output_tile, + tile_shape, + barrier_workspace_size, + reduction_workspace_size, + hw_info, + splits, + decomposition_mode, + barrier_bits, + element_accumulator_bits + ); + + return barrier_workspace_size + reduction_workspace_size; + } + + // Initialize the workspace to be used for the kernel. This variant of the method should only be used when + // problem_shape and tile_shape contain modes of only rank 1. + static cutlass::Status + initialize_workspace( + void* workspace, + BatchedGemmCoord problem_shape, + GemmCoord tile_shape, + KernelHardwareInfo const& hw_info, + int splits, + DecompositionMode decomposition_mode, + uint32_t barrier_bits, + uint32_t element_accumulator_bits) { + + dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, tile_shape); + uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k(); + + return initialize_workspace( + workspace, + problem_blocks, + k_tiles_per_output_tile, + tile_shape, + hw_info, + splits, + decomposition_mode, + barrier_bits, + element_accumulator_bits + ); + } + + // Version of initialize_workspace that takes in as input the number of CTAs in the M and N dimensions. + // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1, + // for which using CuTe algebra for calculating tile shapes is easiest. 
+ static cutlass::Status + initialize_workspace( + void* workspace, + dim3 problem_blocks, + uint32_t k_tiles_per_output_tile, + GemmCoord tile_shape, + KernelHardwareInfo const& hw_info, + int splits, + DecompositionMode decomposition_mode, + uint32_t barrier_bits, + uint32_t element_accumulator_bits) { + + uint64_t barrier_workspace_size = 0; + uint64_t reduction_workspace_size = 0; + + get_workspace_component_sizes( + problem_blocks, + k_tiles_per_output_tile, + tile_shape, + barrier_workspace_size, + reduction_workspace_size, + hw_info, + splits, + decomposition_mode, + barrier_bits, + element_accumulator_bits + ); + + if (barrier_workspace_size > 0) { + if (workspace == nullptr) { + return Status::kErrorWorkspaceNull; + } + + // Only the barrier workspace needs to be cleared for stream-K. + // Barrier workspace follows reduction workspace. + uint8_t* barrier_workspace = reinterpret_cast(workspace) + reduction_workspace_size; + return zero_workspace(static_cast(barrier_workspace), barrier_workspace_size); + } + + return Status::kSuccess; + } + + void + set_params_basic( + uint32_t blocks_m, + uint32_t blocks_n, + uint32_t blocks_l, + uint32_t splits, + uint32_t k_tiles_per_output_tile, + void* reduction_workspace, + ReductionMode reduction_mode) { + + divmod_batch_ = FastDivmodU64(blocks_m * blocks_n); + divmod_tiles_per_output_tile_ = FastDivmod(k_tiles_per_output_tile); + divmod_sk_groups_ = FastDivmodU64(1u); + divmod_splits_ = FastDivmod(splits); + units_per_problem_ = blocks_m * blocks_n * blocks_l; + big_units_ = k_tiles_per_output_tile % splits; + reduction_workspace_ = reduction_workspace; + reduction_mode_ = reduction_mode; + divmod_k_tiles_per_sk_unit_ = FastDivmod(k_tiles_per_output_tile / splits); + divmod_k_tiles_per_sk_big_unit_ = FastDivmod(k_tiles_per_output_tile / splits + 1); + + // No stream-K work is performed for "basic" data-parallel and split-K decompositions + sk_tiles_ = 0; + sk_units_ = 0; + divmod_sk_units_per_group_ = FastDivmodU64(1u); + } + + private: + // Round up number of bytes to the nearest multiple of L2 cache line alignment + CUTLASS_HOST_DEVICE + static size_t + round_up_to_l2_alignment(size_t bytes) { + constexpr size_t L2CacheLineSizeBytes = 128u; + return (bytes + L2CacheLineSizeBytes - 1) / L2CacheLineSizeBytes * L2CacheLineSizeBytes; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +} // namespace detail +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp new file mode 100644 index 0000000000..b32c65296e --- /dev/null +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -0,0 +1,857 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/barrier.h" +#include "cutlass/block_striped.h" +#include "cutlass/fast_math.h" +#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp" +#include "cutlass/kernel_hardware_info.hpp" +#include "cute/layout.hpp" +#include "cute/tensor.hpp" +#include "cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp" + +namespace cutlass::gemm::kernel::detail { + +// Persistent Thread Block (TB) scheduler leveraging stream-K decomposition +template < + class TileShape +> +class PersistentTileSchedulerIntelPVCStreamK { + // + // Data members + // + +private: + uint64_t current_work_linear_idx_ = 0; + +public: + + // Use a dummy barrier manager to simply get the type used to store the barrier + using BarrierType = typename NamedBarrierManager<1>::T; + + using Params = PersistentTileSchedulerIntelPVCStreamKParams; + using ReductionMode = Params::ReductionMode; + using DecompositionMode = Params::DecompositionMode; + + struct WorkTileInfo { + int32_t M_idx = 0; + int32_t N_idx = 0; + int32_t K_idx = 0; + int32_t L_idx = 0; + + // Number of k tiles to compute for this unit of work. For stream-K, this + // can indicate the number of K tiles across multiple output tiles. + uint32_t k_tile_count = 0; + + // Number of k tiles remaining for the work unit as a whole + uint32_t k_tile_remaining = 0; + + // Whether this unit of work is the final split for the given tile + bool is_separate_reduction = false; + + CUTLASS_HOST_DEVICE + bool + is_valid() const { + // A work tile that computes no K tiles is invalid unless it is a separate-reduction work tile + // (which only performs reduction and epilogue) + return k_tile_count > 0 || is_separate_reduction; + } + + CUTLASS_HOST_DEVICE + bool + is_reduction_unit() const { + return is_separate_reduction; + } + + CUTLASS_HOST_DEVICE + int32_t + reduction_subtile_idx() const { + // For separate reduction units, the K_idx of the work tile is unused. + // Therefore, we override it to contain the subtile of that the reduction + // unit operates on. + return is_reduction_unit() ? 
K_idx : -1; + } + + CUTLASS_HOST_DEVICE + static WorkTileInfo + invalid_work_tile() { + return {-1, -1, -1, -1, 0}; + } + + CUTLASS_HOST_DEVICE + bool + is_final_split(uint32_t k_tiles_per_output_tile) const { + return (K_idx + k_tile_count) == k_tiles_per_output_tile; + } + }; + + struct Arguments { + + Arguments() = default; + Arguments(Arguments const&) = default; + Arguments(Arguments&&) = default; + + CUTLASS_HOST_DEVICE + Arguments& + operator=(Arguments const& args) { + splits = args.splits; + reduction_mode = args.reduction_mode; + decomposition_mode = args.decomposition_mode; + return *this; + } + + CUTLASS_HOST_DEVICE + Arguments& + operator=(Arguments&& args) noexcept { + splits = args.splits; + reduction_mode = args.reduction_mode; + decomposition_mode = args.decomposition_mode; + return *this; + } + + CUTLASS_HOST_DEVICE + Arguments(int splits_) : splits(splits_) {} + + CUTLASS_HOST_DEVICE + Arguments(int splits_, DecompositionMode decomposition_mode_) : + splits(splits_), + decomposition_mode(decomposition_mode_) {} + + // The splitting factor to be used in a split-K decomposition of the problem. + // If this is set to a value greater than 1, stream-K decomposition logic + // is bypassed in favor of a split-K decomposition. + int splits = 1; + ReductionMode reduction_mode = ReductionMode::Deterministic; + DecompositionMode decomposition_mode = DecompositionMode::Heuristic; + }; + + // Sink scheduler params as a member + Params scheduler_params; + + // + // Methods + // + + template + static Params + to_underlying_arguments( + ProblemShape problem_shape, + TileShape tile_shape, + KernelHardwareInfo const& hw_info, + Arguments const& args, + void* workspace) { + + static_assert(cute::is_static::value); + + auto problem_shape_mnkl = cute::append<4>(problem_shape, cute::Int<1>{}); + dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + uint32_t k_tile_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(problem_shape_mnkl), cute::shape<2>(TileShape{}))); + + Params params; + params.initialize( + problem_blocks, + k_tile_per_output_tile, + hw_info, + args.splits, + args.reduction_mode, + args.decomposition_mode, + workspace + ); + return params; + } + + static bool + can_implement(Arguments const& args) { + // Split count > 1 is only valid for heuristic and split-K decomposition modes + return (args.splits == 1 || + args.decomposition_mode == DecompositionMode::Heuristic || + args.decomposition_mode == DecompositionMode::SplitK); + } + + CUTLASS_HOST_DEVICE + PersistentTileSchedulerIntelPVCStreamK() { }; + + CUTLASS_HOST_DEVICE + PersistentTileSchedulerIntelPVCStreamK(Params const& params_) : scheduler_params(params_) { + current_work_linear_idx_ = uint64_t(BlockIdxX()) + uint64_t(BlockIdxY()) * uint64_t(GridDimX()); + } + + CUTLASS_DEVICE + WorkTileInfo + get_current_work() const { + return get_current_work_for_linear_idx(current_work_linear_idx_, scheduler_params); + } + + CUTLASS_DEVICE + static WorkTileInfo + get_current_work_for_linear_idx(uint64_t linear_idx, Params const& params) { + // The maximum number of work units is units_per_problem_ * splits_. + // The multiplication by splits_ is used for handling split-K, in which + // units_per_problem_ is equal to the total number of output tiles. To account + // for the fact that we have splits_ peers per output tile, we multiply this + // value by splits_. For stream-K, this multiplication ends up being a no-op + // because splits_ is set to 1 for stream-K. 
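      // Editor's note (illustrative numbers, not part of the patch): with
      // units_per_problem_ = 8 output tiles and a split-K factor splits_ = 4, the
      // scheduler exposes 8 * 4 = 32 linear work units, so any linear_idx >= 32 takes
      // the invalid-work-tile path below. For stream-K, splits_ == 1 and the bound is
      // simply units_per_problem_.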
+ if(linear_idx >= (params.units_per_problem_ * params.divmod_splits_.divisor)) { + // Invalid work. Return an empty result. + return WorkTileInfo::invalid_work_tile(); + } + + WorkTileInfo work_tile_info; + assign_work(params, linear_idx, work_tile_info); + return work_tile_info; + } + + // Returns whether the current work_tile_info passed in should continue to be used. This + // occurs only in the stream-K decomposition with stream-K work units, which encompass + // work over multiple output tiles. If the current work_tile_info should continue to be + // used, it is updated to advance to the next output tile it should cover. + CUTLASS_DEVICE + bool + continue_current_work(WorkTileInfo& work_tile_info) const { + return continue_current_work_for_linear_idx( + current_work_linear_idx_, work_tile_info, scheduler_params); + } + + CUTLASS_DEVICE + static bool + continue_current_work_for_linear_idx( + uint64_t linear_idx, + WorkTileInfo& work_tile_info, + Params const& params) { + + work_tile_info.k_tile_remaining -= work_tile_info.k_tile_count; + + if (work_tile_info.k_tile_remaining == 0) { + return false; + } + assign_work(params, linear_idx, work_tile_info); + return work_tile_info.is_valid(); + } + + CUTLASS_DEVICE + void + advance_to_next_work(uint32_t advance_count = 1) { + current_work_linear_idx_ += uint64_t(GridDimX()) * uint64_t(GridDimY()) * uint64_t(GridDimZ()) * uint64_t(advance_count); + } + + // Given the inputs, computes the total number of output blocks this problem will compute over + // Note that this is only the logical size of our grid, not the physical grid we will actually launch. + template + CUTLASS_HOST_DEVICE static + dim3 + get_tiled_cta_shape_mnl(ProblemShape problem_shape_mnkl, TileShape cta_shape) { + return Params::get_tiled_cta_shape_mnl(to_gemm_coord(problem_shape_mnkl), to_gemm_coord(cta_shape)); + } + + // Given the cluster shape, computes the physical grid we should launch. + template + CUTLASS_HOST_DEVICE static + dim3 + get_grid_shape( + ProblemShape problem_shape, + TileShape tile_shape, + KernelHardwareInfo hw_info) { + + auto problem_shape_mnkl = cute::append<4>(problem_shape, cute::Int<1>{}); + dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + + return Params::get_grid_shape( + problem_blocks, + hw_info + ); + } + + // Returns whether fixup is needed for `work_tile_info`. + CUTLASS_HOST_DEVICE + static bool + requires_fixup(Params const& params, WorkTileInfo const& work_tile_info) { + // Fixup is not needed for invalid or data-parallel tiles + return work_tile_info.is_valid() && work_tile_info.k_tile_count != params.divmod_tiles_per_output_tile_.divisor; + } + + // Performs the reduction across splits for a given output tile. +/*template + CUTLASS_DEVICE + static void + fixup( + Params const& params, + WorkTileInfo const& work_tile_info, + FrgTensorC& accumulators, + uint32_t num_barriers, + uint32_t barrier_idx) { + static constexpr uint32_t Offset = static_cast(cutlass::arch::ReservedNamedBarriers::StreamkBarrier0); + static constexpr uint32_t MaxNumNamedBarriers = 2; + using BarrierManager = NamedBarrierManager; + return fixup_helper( + params, work_tile_info, accumulators, num_barriers, barrier_idx); + } + + // Helper for performing the reduction across splits for a given output tile. 
+ template + CUTLASS_DEVICE + static void + fixup_helper( + Params const& params, + WorkTileInfo const& work_tile_info, + FrgTensorC& accumulators, + uint32_t num_barriers, + uint32_t barrier_idx, + uint32_t num_accumulator_mtxs = 1) { + + using ElementAccumulator = typename FrgTensorC::value_type; + + if (!requires_fixup(params, work_tile_info)) { + return; + } + auto tile_idx = output_tile_index(params, work_tile_info); + + // Index of the lock on which to wait + auto lock_idx = (tile_idx * num_barriers) + barrier_idx; + + auto reduction_tile_idx = tile_idx; + auto [first_peer_id, my_peer_id, last_peer_id] = tile_peer_range(params, tile_idx, static_cast(work_tile_info.K_idx)); + auto reduction_peer_offset = 0; + if (params.requires_separate_reduction()) { + // If separate reduction is to be performed, each stream-K unit writes its partials + // to a separate portion of the workspace. There are as many of these portions as there + // are peers for a given output tile, so we multiply the tile index by the maximum peer count. + reduction_tile_idx *= Params::max_peers_per_tile(params.sk_units_, params.sk_tiles_); + reduction_peer_offset = my_peer_id * cute::size<0>(TileShape{}) * cute::size<1>(TileShape{}); + } + + // Reductions use BlockStripedReduce with a width of BarrierManager::ThreadCount under the hood. + // Thus, the start of the reduction space is the same across all threads in a warp group. + int reduction_offset = + (cute::size<0>(TileShape{}) * cute::size<1>(TileShape{}) * reduction_tile_idx * num_accumulator_mtxs) + + reduction_peer_offset + + (size(accumulators) * barrier_idx * BarrierManager::ThreadCount); + + ElementAccumulator* group_reduction_workspace = reinterpret_cast(params.reduction_workspace_) + reduction_offset; + + using AccumulatorArrayT = Array; + using BlockStripedReduceT = BlockStripedReduce; + + AccumulatorArrayT* reduction_workspace_array = reinterpret_cast(group_reduction_workspace); + AccumulatorArrayT* accumulator_array = reinterpret_cast(accumulators.data()); + + int barrier_group_thread_idx = ThreadIdxX() % BarrierManager::ThreadCount; + + // The number of tiles for which reduction is required is either: + // (a) the total number of output tiles (in the case of split-K) + // (b) the number of stream-K tiles (potentially multiplied by peer count if using separate reduction) + // To calculate the total number of output tiles in the split-K case, we + // note that, in the split-K case, the units_per_problem_ member of Params will be + // the total number of output tiles. + uint32_t reduction_tiles = 0; + if (params.divmod_splits_.divisor > 1) { + reduction_tiles = params.units_per_problem_; + } + else if (params.requires_separate_reduction()) { + reduction_tiles = params.sk_tiles_ * Params::max_peers_per_tile(params.sk_units_, params.sk_tiles_); + } + else { + reduction_tiles = params.sk_tiles_; + } + + auto reduction_workspace_size = Params::get_reduction_workspace_size( + reduction_tiles, to_gemm_coord(TileShape{}), sizeof_bits::value, num_accumulator_mtxs); + BarrierType* lock_workspace = reinterpret_cast( + reinterpret_cast(params.reduction_workspace_) + reduction_workspace_size); + + if (work_tile_info.is_reduction_unit()) { + plus add_fragments; + auto peer_offset = size(accumulators) * num_barriers * BarrierManager::ThreadCount; + + // Wait until the peers collaborating on this output tile have all written + // their accumulators to workspace. 
+ uint32_t num_peers = last_peer_id - first_peer_id + 1; + BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, num_peers); + + // Load the first peer's data + BlockStripedReduceT::load(*accumulator_array, reduction_workspace_array, barrier_group_thread_idx); + + for (int i = 1; i < num_peers; ++i) { + // Load peer fragment + AccumulatorArrayT addend_fragment; + auto peer_reduction_workspace = reinterpret_cast(group_reduction_workspace + (i * peer_offset)); + + BlockStripedReduceT::load(addend_fragment, peer_reduction_workspace, barrier_group_thread_idx); + + // Add peer fragment + *accumulator_array = add_fragments(*accumulator_array, addend_fragment); + } + } + else if (!compute_epilogue(work_tile_info, params)) { + if (params.requires_separate_reduction() || work_tile_info.K_idx == 0) { + // The first peer initializes the workspace partials in the non-separate-reduction case, + // and all peers write to their own location in workspace when using separate reduction + BlockStripedReduceT::store(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx); + } + else { + // Wait until the preceding split added its accumulators + BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx); + + // Perform reduction in workspace + BlockStripedReduceT::reduce(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx); + } + + // If separate reduction is being performed, each participating stream-K unit increments the barrier + // by only 1. Otherwise, increment by the K tile count that this unit has processed. + int32_t increment = params.requires_separate_reduction() ? 1 : work_tile_info.k_tile_count; + + // Signal our arrival + BarrierManager::arrive_inc(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, increment); + } + else { + if (params.reduction_mode_ == ReductionMode::Deterministic) { + // Wait until the preceding split added its accumulators + BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx); + } + else { + // Wait unitl the first split has stored its accumulators + BarrierManager::wait_lt(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, 1); + } + + // The block computing the final split for the tile adds previously-reduced partials + // to its accumulators and computes the epilogue. + BlockStripedReduceT::load_add(*accumulator_array, reduction_workspace_array, barrier_group_thread_idx); + } + } + + // Returns whether the block assigned this work should compute the epilogue for the corresponding + // output tile. For the case of stream-K, this should only occur if the work is marked as the final split. + CUTLASS_HOST_DEVICE + static bool + compute_epilogue(WorkTileInfo const& work_tile_info, Params const& params) { + // `is_final_split` will be set to `true` for the following scenarios, all of which must compute the epilogue: + // 1. The tile is computed in data-parallel mode + // 2. The tile is computed in split-/stream-K mode and this work unit represents the final split of the tile + // 3. 
The tile is computed in split-/stream-K mode and separate reduction is used, and this is a separate reduction unit + return work_tile_info.is_valid() && + (work_tile_info.is_final_split(params.divmod_tiles_per_output_tile_.divisor) && + !params.requires_separate_reduction()) || work_tile_info.is_separate_reduction; + } + + // Returns the linearized index of the output tile corresponding to the tile with offset [L, M, K] + CUTLASS_DEVICE + static int + output_tile_index(Params const& params, WorkTileInfo const& work_tile_info) { + uint64_t linear_idx_in_batch = UnderlyingScheduler::get_linear_idx_from_m_and_n( + work_tile_info.M_idx, work_tile_info.N_idx, + params.divmod_cluster_shape_major_, + params.divmod_cluster_shape_minor_, + params.divmod_cluster_blk_major_, + params.log_swizzle_size_, + params.raster_order_ + ); + + uint64_t tiles_mn = params.divmod_batch_.divisor; + return tiles_mn * work_tile_info.L_idx + linear_idx_in_batch; + }*/ + + template + static size_t + get_workspace_size( + Arguments const& args, + ProblemShape problem_shape, + KernelHardwareInfo const& hw_info) { + + auto problem_shape_mnkl = cute::append<4>(problem_shape, 1); + + TileShape tile_shape; + + dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + uint32_t k_tile_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(problem_shape_mnkl), cute::shape<2>(TileShape{}))); + + return Params::get_workspace_size( + problem_blocks, + k_tile_per_output_tile, + to_gemm_coord(tile_shape), + hw_info, + args.splits, + args.decomposition_mode, + sizeof_bits::value, + sizeof_bits::value + ); + } + + template + static cutlass::Status + initialize_workspace( + Arguments const& args, + void* workspace, + ProblemShape const& problem_shape, + KernelHardwareInfo const& hw_info) { + + auto problem_shape_mnkl = cute::append<4>(problem_shape, 1); + + TileShape tile_shape; + + dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + uint32_t k_tile_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(problem_shape_mnkl), cute::shape<2>(TileShape{}))); + + return Params::initialize_workspace( + workspace, + problem_blocks, + k_tile_per_output_tile, + to_gemm_coord(tile_shape), + hw_info, + args.splits, + args.decomposition_mode, + sizeof_bits::value, + sizeof_bits::value + ); + } + + template + CUTLASS_HOST_DEVICE + static int + get_work_k_tile_count(WorkTileInfo const& work_tile_info, ProblemShape, TileShape) { + return work_tile_info.k_tile_count; + } + + CUTLASS_HOST_DEVICE + static uint32_t + get_work_k_tile_start(WorkTileInfo const& work_tile_info) { + return work_tile_info.K_idx; + } + + // Kernel helper function to get next work tile + CUTLASS_DEVICE + auto + fetch_next_work(WorkTileInfo work_tile_info) { + if (continue_current_work(work_tile_info)) { + return work_tile_info; + } + + advance_to_next_work(); + return get_current_work(); + } + + // Returns the initial work tile info that will be computed over + CUTLASS_DEVICE + WorkTileInfo + initial_work_tile_info() { + return get_current_work(); + } + +private: + // Sets the current stream-K work to compute within work_tile_info. If new_unit is true, work_tile_info + // is populated as a new unit of work. Otherwise, state existing in work_tile_info (e.g., remaining + // iterations) is used to find the next tile in the current work unit. 
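  // Editor's sketch (assumed usage, mirroring the persistent-loop pattern of the
  // stream-K GEMM kernel in this patch; `params`, the mainloop and the epilogue
  // calls are placeholders rather than identifiers from this file):
  //
  //   TileScheduler scheduler{params.scheduler};
  //   auto work_tile_info = scheduler.initial_work_tile_info();
  //   while (work_tile_info.is_valid()) {
  //     // mainloop over work_tile_info.k_tile_count K tiles, then fixup(...)
  //     // and, if compute_epilogue(...), the epilogue for this output tile
  //     work_tile_info = scheduler.fetch_next_work(work_tile_info);
  //   }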
+/*CUTLASS_DEVICE + static void + assign_work( + Params const& params, + uint64_t linear_idx, + WorkTileInfo& work_tile_info) { + + auto [cta_m_in_cluster_, cta_n_in_cluster_, _] = cute::block_id_in_cluster(); + uint64_t cta_m_in_cluster = static_cast(cta_m_in_cluster_); + uint64_t cta_n_in_cluster = static_cast(cta_n_in_cluster_); + uint64_t output_tile_id = linear_idx; + if (linear_idx >= params.units_per_problem_ * params.divmod_splits_.divisor) { + // Separate-reduction work + auto cluster_size = params.get_cluster_size(); + // Divide up the linearized separate reduction units into clusters + auto cluster_linear_reduction_unit_idx = params.div_cluster_size((linear_idx - params.units_per_problem_)); + uint64_t cluster_tile_idx, epi_subtile_idx; + params.divmod_epilogue_subtile_(cluster_tile_idx, epi_subtile_idx, cluster_linear_reduction_unit_idx); + // Bring the linearized tile ID back into the space of tiles, rather than clusters + output_tile_id = cluster_tile_idx * cluster_size; + + work_tile_info.setup_separate_reduction(epi_subtile_idx); + } + else if (linear_idx >= params.sk_units_ && params.divmod_splits_.divisor == 1) { + // Data-parallel work + output_tile_id = linear_idx - params.sk_units_ + params.sk_tiles_; + work_tile_info.K_idx = 0; + work_tile_info.k_tile_count = params.divmod_tiles_per_output_tile_.divisor; + work_tile_info.k_tile_remaining = params.divmod_tiles_per_output_tile_.divisor; + } + else { + // In the CUTLASS 2.x implementation of stream K, stream-K work is assigned to each stream-K + // threadblock individually. For the most part, the set of K iterations corresponding to stream-K + // work was divided amongst stream-K threadblocks, and a threadblock determined which tile + // it would compute a (potentially-partial) output tile for based on the space of k iterations + // assigned to it. This often results in stream-K threadblocks processing tiles with different + // offsets in the K dimension from one another. This can reduce locality, but is lmitied to the + // (generally few) waves of threadblocks assigned to compute stream-K work. + // + // With the introduction of threadblock clusters, there is additional benefit to maintaining + // locality in the K dimension: shared portions of operands can be multicasted to threadblocks + // within a cluster. Thus, we would like to ensure that the assignment of stream-K work to + // threadblocks respects the ability to perform multicasting. + // + // To do so, we divide up the linearized stream-K units into clusters and share the same K + // offsets for work within clusters. + + auto cluster_linear_work_idx = params.div_cluster_size(linear_idx); + + uint64_t group_idx; + params.divmod_sk_groups_(cluster_linear_work_idx, group_idx, cluster_linear_work_idx); + + // Determine whether we are in a "big group" that will process an additional + // stream-K cluster tile. + auto sk_cluster_tiles = params.div_cluster_size(params.sk_tiles_); + auto sk_cluster_tiles_in_group = params.divmod_sk_groups_.divide(sk_cluster_tiles); + if (group_idx < params.big_groups_) { + ++sk_cluster_tiles_in_group; + } + + // Determine whether we are in a "big unit" within the group, that will process + // an additional K chunk in the group. 
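      // Editor's note (illustrative numbers, not part of the patch): with
      // k_tiles_in_group = 50 and 8 stream-K units per group,
      // k_tiles_per_unit_in_group = 50 / 8 = 6 and big_units_in_group = 50 - 8 * 6 = 2,
      // so the first two units cover 7 K tiles each and the remaining six cover 6 each
      // (2 * 7 + 6 * 6 = 50).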
+ auto sk_tiles_in_group = sk_cluster_tiles_in_group * params.get_cluster_size(); + auto k_tiles_in_group = sk_tiles_in_group * params.divmod_tiles_per_output_tile_.divisor; + auto k_tiles_per_unit_in_group = params.divmod_sk_units_per_group_.divide(k_tiles_in_group); + auto big_units_in_group = params.div_cluster_size( + k_tiles_in_group - (k_tiles_per_unit_in_group * params.divmod_sk_units_per_group_.divisor)); + + uint64_t split; + params.divmod_clusters_mnl_(split, cluster_linear_work_idx, cluster_linear_work_idx); + + bool is_split_k = params.divmod_splits_.divisor > 1; + auto big_unit_cmp_lhs = is_split_k ? split : cluster_linear_work_idx; + auto big_unit_cmp_rhs = is_split_k ? params.big_units_ : big_units_in_group; + auto linear_idx_mult = is_split_k ? params.divmod_tiles_per_output_tile_.divisor : k_tiles_per_unit_in_group; + auto k_tiles_per_split = is_split_k ? params.divmod_k_tiles_per_sk_unit_.divisor : k_tiles_per_unit_in_group; + + // Determine the starting k iteration computed by this stream-K work unit + uint32_t unit_iter_start = (linear_idx_mult * cluster_linear_work_idx) + + (k_tiles_per_split * split); + + // Adjust the starting position and number of k iterations for "big units," which + // compute one extra iteration. If there are any big units, they will be the first + // in the linearized ID space. + auto k_tiles_in_my_split = k_tiles_per_split; + if (big_unit_cmp_lhs < big_unit_cmp_rhs) { + // Since the "big units" are the first units in the linearized ID space, each + // of the units preceding this big unit computed one extra iteration. Thus, + // we must offset our start iteration by the number of units that precede + // the current unit in the linearized ID space. + unit_iter_start += big_unit_cmp_lhs; + ++k_tiles_in_my_split; + } + else { + // Increment by one for each of the big clusters (since all big units precede this unit) + unit_iter_start += big_unit_cmp_rhs; + } + + if (!is_split_k) { + // Adjust the unit starting position and number of tiles to avoid + // computing splits of size less than min_iters_per_sk_unit_ + int unused, start_tile_k_tile; + params.divmod_tiles_per_output_tile_(unused, start_tile_k_tile, unit_iter_start); + if (start_tile_k_tile < Params::min_iters_per_sk_unit_) { + // Starting K tile is in range [0, Params::min_iters_per_sk_unit_), which means that another + // stream-K unit will be computing a split with fewer than Params::min_iters_per_sk_unit_ K tiles. + // Adjust our work to take over these K tiles. + unit_iter_start -= start_tile_k_tile; + k_tiles_in_my_split += start_tile_k_tile; + } + else if (start_tile_k_tile > (params.divmod_tiles_per_output_tile_.divisor - Params::min_iters_per_sk_unit_)) { + // Starting K tile is within the final Params::min_iters_per_sk_unit_ K tiles of some output tile, + // which means that this unit will compute a split with fewer than Params::min_iters_per_sk_unit_ K tiles. + // Adjust our work to shed these K tiles to a neighboring stream-K unit that will compute more consecutive K tiles. 
+ auto adjustment_tiles = (params.divmod_tiles_per_output_tile_.divisor - start_tile_k_tile); + unit_iter_start += adjustment_tiles; + k_tiles_in_my_split -= adjustment_tiles; + } + else if (params.ktile_start_alignment_count == 2 && start_tile_k_tile % 2 != 0) { + // ktile for each SM start from even number + // If start from odd number ktile within the output tile + // now start at the ktile one before my initial ktile start (take one ktile from prev sm) + // if end on odd number ktile within the output tile + // now end at ktile that one before my ktile end (give one ktile to next sm) + unit_iter_start -= 1; + k_tiles_in_my_split += 1; + } + } + + if (work_tile_info.k_tile_count == 0) { + // This is a new unit + + if (!is_split_k) { + // + // Adjust the unit ending position and number of tiles to avoid + // computing splits of size less than min_iters_per_sk_unit_ + // + + // Begin by assuming that no adjustment is needed + auto initial_unit_iter_end = unit_iter_start + k_tiles_in_my_split; + + int unused, end_tile_k_tile; + params.divmod_tiles_per_output_tile_(unused, end_tile_k_tile, initial_unit_iter_end); + + if (end_tile_k_tile < Params::min_iters_per_sk_unit_) { + // Ending K tile is within the first Params::min_iters_per_sk_unit_ K tiles of some output tile, + // which means that this unit will compute a split with fewer than Params::min_iters_per_sk_unit_ K tiles. + // Adjust our work to shed these K tiles to a neighboring stream-K unit that will compute more consecutive K tiles. + k_tiles_in_my_split -= end_tile_k_tile; + } + else if (end_tile_k_tile > (params.divmod_tiles_per_output_tile_.divisor - Params::min_iters_per_sk_unit_)) { + // Ending K tile is within the final Params::min_iters_per_sk_unit_ K tiles of some output tile, + // which means that some other unit will compute a split with fewer than Params::min_iters_per_sk_unit_ K tiles. + // Adjust our work to take on these K tiles. + k_tiles_in_my_split += (params.divmod_tiles_per_output_tile_.divisor - end_tile_k_tile); + } + else if (params.ktile_start_alignment_count == 2 && end_tile_k_tile % 2 != 0) { + // ktile for each SM start from even number + // If start from odd number ktile within the output tile + // now start at the ktile one before my initial ktile start (take one ktile from prev sm) + // If end on odd number ktile within the output tile, + // now end at ktile that one before my ktile end (give one ktile to next sm) + k_tiles_in_my_split -= 1; + } + } + + work_tile_info.k_tile_remaining = k_tiles_in_my_split; + } + + uint32_t unit_iter_end = unit_iter_start + work_tile_info.k_tile_remaining - 1; + + // Find the output tile corresponding to the final k tile covered by this + // work unit. Stream-K work units will work backwards in terms of the tiles they + // are responsible computing. This is beneficial because the final (partial) + // tile computed by a stream-K block is typically the beginning of the output + // tile, while the beginning (partial) tile is typically the ending of another + // output tile. Since ending portions of an output tile must reduce across + // other work units computing portions of that output tile, it is preferable + // for them to be computed later, so as to reduce the likelihood of blocking + // on other work. 
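      // Editor's worked example (illustrative, not part of the patch; ignores the
      // min_iters_per_sk_unit_ adjustments above): with 8 K tiles per output tile and a
      // unit assigned global K iterations [5, 13), unit_iter_end = 12 lands in output
      // tile 1, so the unit first computes K tiles [0, 5) of tile 1 (K_idx = 0,
      // k_tile_count = 5). Once continue_current_work() subtracts those 5 iterations,
      // unit_iter_end becomes 7 and the unit finishes with K tiles [5, 8) of tile 0
      // (K_idx = 5, k_tile_count = 3), i.e. it walks backwards over the tiles it touches.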
+ + auto output_tile_id_in_group = params.divmod_tiles_per_output_tile_.divide(unit_iter_end); + uint32_t output_tile_iter_start = output_tile_id_in_group * params.divmod_tiles_per_output_tile_.divisor; + uint32_t output_tile_iter_end = output_tile_iter_start + params.divmod_tiles_per_output_tile_.divisor; + + // Convert the output tile from the linearized space within each group to the + // overall linearized space. + output_tile_id = (output_tile_id_in_group * params.divmod_sk_groups_.divisor) + group_idx; + + // Bring the linearized tile ID back into the space of tiles, rather than clusters + output_tile_id *= params.get_cluster_size(); + + // The final linearized tile ID is in units of the cluster dimension over which we rasterize. + if (params.raster_order_ == RasterOrder::AlongN) { + output_tile_id += cta_n_in_cluster * params.divmod_cluster_shape_minor_.divisor; + } + else { + output_tile_id += cta_m_in_cluster * params.divmod_cluster_shape_minor_.divisor; + } + + // The unit's starting k iteration in the current tile is either the starting + // iteration for the tile as a whole, or the starting k iteration for the unit + // as a whole (if the latter is greater than the former). + uint32_t tile_iter_start = max(output_tile_iter_start, unit_iter_start); + + // Similarly, the unit's ending k iteration (exclusive) is either the end of + // the current tile it is assigned, or the ending iteration of the unit as a whole + // (if the latter is less than the former). + uint32_t tile_iter_end = min(output_tile_iter_end, unit_iter_end + 1); + + // Set the k offset to be the starting k tile for this output tile + work_tile_info.K_idx = static_cast(tile_iter_start - output_tile_iter_start); + work_tile_info.k_tile_count = tile_iter_end - tile_iter_start; + } + + uint64_t work_idx_l, remainder; + params.divmod_batch_(work_idx_l, remainder, output_tile_id); + + uint64_t cta_per_grid_dim = params.divmod_cluster_shape_minor_.divide(remainder); + + auto [work_idx_m, work_idx_n] = UnderlyingScheduler::get_work_idx_m_and_n( + cta_per_grid_dim, + params.divmod_cluster_shape_major_, + params.divmod_cluster_shape_minor_, + params.divmod_cluster_blk_major_, + params.log_swizzle_size_, + params.raster_order_ + ); + + // Set the M, N, and L block offsets + work_tile_info.M_idx = work_idx_m; + work_tile_info.N_idx = work_idx_n; + work_tile_info.L_idx = static_cast(work_idx_l); + } + + // Returns the starting and ending peer ID of this tile + CUTLASS_HOST_DEVICE + static auto + tile_peer_range(Params const& params, uint32_t tile_idx, uint32_t cur_k_tile) { + auto tile_idx_in_cluster_path = params.div_cluster_size(tile_idx); + auto start_k_tile = params.divmod_tiles_per_output_tile_.divisor * tile_idx_in_cluster_path; + auto end_k_tile = start_k_tile + params.divmod_tiles_per_output_tile_.divisor - 1; + auto big_unit_k_tiles = params.big_units_ * (params.divmod_k_tiles_per_sk_unit_.divisor + 1); + + auto adjust_unit = [&](uint32_t k_tile, uint32_t unit_idx, uint32_t k_tiles_per_unit) { + auto unit_k_start = unit_idx * k_tiles_per_unit; + auto unit_k_end = unit_k_start + k_tiles_per_unit; + if (k_tile - start_k_tile < Params::min_iters_per_sk_unit_ && + unit_k_end - start_k_tile < Params::min_iters_per_sk_unit_) { + // k_tile is within the first min_iters_per_sk_unit_ K tiles of this output tile, + // and the stream-K unit computes fewer than min_iters_per_sk_unit_ K tiles for this + // output tile. This work will thus be subsumed by the next stream-K unit. 
+ ++unit_idx; + } + + if (end_k_tile + 1 - k_tile < Params::min_iters_per_sk_unit_ && + end_k_tile + 1 - unit_k_start < Params::min_iters_per_sk_unit_) { + // k_tile is within the last min_iters_per_sk_unit_ K tiles of this output tile, + // and the stream-K unit computes fewer than min_iters_per_sk_unit_ K tiles for this + // output tile. This work will thus be subsumed by the previous stream-K unit. + --unit_idx; + } + + return unit_idx; + }; + + // Lambda to find the ID of the stream-K unit that computes this K tile + auto find_unit = [&](uint32_t k_tile) { + if (k_tile < big_unit_k_tiles) { + // The tile is within the "big unit range" + auto unit_idx = params.divmod_k_tiles_per_sk_big_unit_.divide(k_tile); + return static_cast(adjust_unit(k_tile, unit_idx, params.divmod_k_tiles_per_sk_big_unit_.divisor)); + } + else { + // The tile is after the "big unit range." Account for this by finding the "normal unit" + // that it belongs to, and then offsetting by the number of big units + auto unit_idx = params.divmod_k_tiles_per_sk_unit_.divide(k_tile - big_unit_k_tiles) + params.big_units_; + return static_cast(adjust_unit(k_tile, unit_idx, params.divmod_k_tiles_per_sk_unit_.divisor)); + } + }; + + return cute::make_tuple(find_unit(start_k_tile), find_unit(cur_k_tile), find_unit(end_k_tile)); + }*/ +}; + +} // namespace cutlass::gemm::kernel::detail diff --git a/include/cutlass/gemm/kernel/tile_scheduler.hpp b/include/cutlass/gemm/kernel/tile_scheduler.hpp index 9835e37fc8..e354e7a48c 100644 --- a/include/cutlass/gemm/kernel/tile_scheduler.hpp +++ b/include/cutlass/gemm/kernel/tile_scheduler.hpp @@ -40,6 +40,9 @@ #include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp" #include "cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp" #include "cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp" +#if defined (SYCL_INTEL_TARGET) +#include "cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp" +#endif //////////////////////////////////////////////////////////////////////////////// namespace cutlass::gemm { @@ -127,6 +130,19 @@ struct TileSchedulerSelector< using Scheduler = PersistentTileSchedulerSm90StreamK; }; +template < + class TileShape, + class ClusterShape +> +struct TileSchedulerSelector< + StreamKScheduler, + arch::IntelPVC, + TileShape, + ClusterShape + > { + using Scheduler = PersistentTileSchedulerIntelPVCStreamK; +}; + template < class TileShape, class ClusterShape diff --git a/include/cutlass/workspace.h b/include/cutlass/workspace.h index 31c48435b1..bb74826805 100644 --- a/include/cutlass/workspace.h +++ b/include/cutlass/workspace.h @@ -61,7 +61,9 @@ zero_workspace(void* workspace, size_t workspace_size, cudaStream_t stream = nul CUTLASS_TRACE_HOST(" clearing workspace"); -#if defined(CUTLASS_ENABLE_CUDA_HOST_ADAPTER) && CUTLASS_ENABLE_CUDA_HOST_ADAPTER +#if defined CUTLASS_ENABLE_SYCL + syclcompat::memset_async(workspace, 0, workspace_size); +#elif defined(CUTLASS_ENABLE_CUDA_HOST_ADAPTER) && CUTLASS_ENABLE_CUDA_HOST_ADAPTER // // Use the cuda host adapter // From ce8b3a27be55fbc77eb0674b08bd5a48c38dc353 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Thu, 29 Aug 2024 00:20:09 +0100 Subject: [PATCH 02/30] fixed starting index calculation --- include/cutlass/barrier.h | 22 ++++++++++--------- ...rsistent_tile_scheduler_params_streamk.hpp | 15 +++++++------ .../intel_pvc_tile_scheduler_streamk.hpp | 16 +++++++++----- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/include/cutlass/barrier.h b/include/cutlass/barrier.h index 7d6a0ec902..9cf7343976 100644 --- 
a/include/cutlass/barrier.h +++ b/include/cutlass/barrier.h @@ -98,11 +98,10 @@ struct GenericBarrier { int state = 0; #ifdef SYCL_INTEL_TARGET - using atomicT = sycl::atomic_ref; - auto atm = atomicT(*ptr); - return atm.load(); + sycl::access::address_space::global_space>(*ptr); + return atm.load(sycl::memory_order::acquire); #elif defined (__CUDA_ARCH__ >= 700) /// SM70 and newer use memory consistency qualifiers @@ -146,8 +145,8 @@ struct GenericBarrier { if (thread_idx == 0) { // Spin-loop - // #pragma unroll 1 - // while(ld_acquire(flag_ptr) < count) {} + #pragma unroll 1 + while(ld_acquire(flag_ptr) < count) {} } Sync::sync(); @@ -161,9 +160,10 @@ struct GenericBarrier { if (thread_idx == 0) { + //printf("BlockID: %lu | wait_eq: %d\n", BlockIdxY(), val); // Spin-loop - // #pragma unroll 1 - // while(ld_acquire(flag_ptr) != val) {} + #pragma unroll 1 + while(ld_acquire(flag_ptr) != val) {break;} } Sync::sync(); } @@ -175,9 +175,10 @@ struct GenericBarrier { if (thread_idx == 0) { + //printf("BlockID: %lu | wait_eq_reset: %d\n", BlockIdxY(), val); // Spin-loop - // #pragma unroll 1 - // while(atomicCAS(flag_ptr, val, 0) != val) {} + #pragma unroll 1 + while(atomicCAS(flag_ptr, val, 0) != val) {} } Sync::sync(); @@ -193,6 +194,7 @@ struct GenericBarrier { if (thread_idx == 0) { + //printf("BlockID: %lu | arrive_inc_val: %d\n", BlockIdxY(), val); red_release(flag_ptr, val); } } diff --git a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp index 478c6dbb3a..0d22afeb8e 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp @@ -140,7 +140,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { static constexpr uint32_t min_iters_per_sk_unit_ = 8u; // Maximum number of groups of stream-K units - // static constexpr uint32_t max_sk_groups_ = 8u; + static constexpr uint32_t max_sk_groups_ = 1u; // ktile start from even for each cta uint32_t ktile_start_alignment_count { 1u }; @@ -351,7 +351,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { groups = fallback_groups; }*/ - uint32_t groups = 1; + uint32_t groups = max_sk_groups_; auto sk_units_per_group = sk_units / groups; @@ -427,8 +427,6 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { cute::tuple get_work_idx_m_and_n( uint64_t blk_per_grid_dim, - FastDivmodU64Pow2 const& divmod_cluster_shape_major, - FastDivmodU64Pow2 const& divmod_cluster_shape_minor, FastDivmodU64 const& divmod_cluster_blk_major) { uint64_t m_idx, n_idx; @@ -471,9 +469,12 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { dim3 problem_blocks, KernelHardwareInfo hw_info ) { - uint32_t available_sms = hw_info.sm_count / 8; - uint32_t dimx = ((problem_blocks.x * problem_blocks.y) + available_sms - 1) / available_sms; - return dim3{available_sms, dimx, problem_blocks.z}; + uint32_t available_sms = 32;//hw_info.sm_count / 8; + // printf("available_sms: %d\n", available_sms); + auto possibly_truncate = [&](int x, int y) { + return static_cast(platform::min(x, y)); + }; + return dim3{1, possibly_truncate(available_sms, problem_blocks.x * problem_blocks.y * problem_blocks.z), 1}; } // Returns the number of stream-K tiles that will be computed amongst `output_tiles` total diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp 
b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index 91174cfc89..e1f5edb64c 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -185,7 +185,8 @@ class PersistentTileSchedulerIntelPVCStreamK { CUTLASS_HOST_DEVICE PersistentTileSchedulerIntelPVCStreamK(Params const& params_) : scheduler_params(params_) { - current_work_linear_idx_ = uint64_t(BlockIdxX()) + uint64_t(BlockIdxY()) * uint64_t(GridDimX()); + // current_work_linear_idx_ = uint64_t(BlockIdxX()) + uint64_t(BlockIdxY()) * uint64_t(GridDimX()); + current_work_linear_idx_ = uint64_t(BlockIdxY()); } CUTLASS_DEVICE @@ -580,8 +581,8 @@ CUTLASS_DEVICE // auto big_units_in_group = params.div_cluster_size( // k_tiles_in_group - (k_tiles_per_unit_in_group * params.divmod_sk_units_per_group_.divisor)); - // uint64_t split; - // params.divmod_clusters_mnl_(split, cluster_linear_work_idx, cluster_linear_work_idx); + uint64_t split; + params.divmod_sk_units_per_group_(split, output_tile_id, output_tile_id); bool is_split_k = params.divmod_splits_.divisor > 1; auto big_unit_cmp_lhs = output_tile_id; @@ -590,7 +591,7 @@ CUTLASS_DEVICE auto k_tiles_per_split = is_split_k ? params.divmod_k_tiles_per_sk_unit_.divisor : k_tiles_per_unit_in_group; // Determine the starting k iteration computed by this stream-K work unit - uint32_t unit_iter_start = (linear_idx_mult * linear_idx) + k_tiles_per_split; + uint32_t unit_iter_start = (linear_idx_mult * linear_idx) + (k_tiles_per_split * split); // Adjust the starting position and number of k iterations for "big units," which // compute one extra iteration. If there are any big units, they will be the first @@ -733,8 +734,6 @@ CUTLASS_DEVICE auto [work_idx_m, work_idx_n] = Params::get_work_idx_m_and_n( cta_per_grid_dim, - params.divmod_cluster_shape_major_, - params.divmod_cluster_shape_minor_, params.divmod_cluster_blk_major_ ); @@ -742,6 +741,11 @@ CUTLASS_DEVICE work_tile_info.M_idx = work_idx_m; work_tile_info.N_idx = work_idx_n; work_tile_info.L_idx = work_idx_l; + + // if(linear_idx >= 32 && ThreadIdxX() == 0) + // printf("BlockID: %lu | k_tile_count: %d | M_idx: %lu | N_idx: %lu | K_idx: %lu | L_idx: %lu | ctas_per_grid_dim: %lu | output_tile_id: %lu\n", + // BlockIdxY(), work_tile_info.k_tile_count, work_tile_info.M_idx, work_tile_info.N_idx, work_tile_info.K_idx, + // work_tile_info.L_idx, remainder, output_tile_id); } // Returns the starting and ending peer ID of this tile From 9599d62b71dabb32f0aa8a8557527be03b335c79 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Thu, 29 Aug 2024 01:41:01 +0100 Subject: [PATCH 03/30] Fixed barrier count update --- include/cutlass/barrier.h | 5 +---- .../gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp | 10 +++++----- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/include/cutlass/barrier.h b/include/cutlass/barrier.h index 9cf7343976..f561ec664e 100644 --- a/include/cutlass/barrier.h +++ b/include/cutlass/barrier.h @@ -160,10 +160,9 @@ struct GenericBarrier { if (thread_idx == 0) { - //printf("BlockID: %lu | wait_eq: %d\n", BlockIdxY(), val); // Spin-loop #pragma unroll 1 - while(ld_acquire(flag_ptr) != val) {break;} + while(ld_acquire(flag_ptr) != val) {} } Sync::sync(); } @@ -175,7 +174,6 @@ struct GenericBarrier { if (thread_idx == 0) { - //printf("BlockID: %lu | wait_eq_reset: %d\n", BlockIdxY(), val); // Spin-loop #pragma unroll 1 while(atomicCAS(flag_ptr, val, 0) != val) {} @@ -194,7 +192,6 @@ struct GenericBarrier { if 
(thread_idx == 0) { - //printf("BlockID: %lu | arrive_inc_val: %d\n", BlockIdxY(), val); red_release(flag_ptr, val); } } diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index e1f5edb64c..af929c2b87 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -294,7 +294,7 @@ template uint32_t num_barriers = 1, uint32_t barrier_idx = 0) { static constexpr uint32_t Offset = static_cast(cutlass::arch::ReservedNamedBarriers::StreamkBarrier0); - static constexpr uint32_t MaxNumNamedBarriers = 2; + static constexpr uint32_t MaxNumNamedBarriers = 1; using BarrierManager = NamedBarrierManager; return fixup_helper( params, work_tile_info, subgroup_id, accumulators, num_barriers, barrier_idx); @@ -371,7 +371,7 @@ template } else { // Wait until the preceding split added its accumulators - BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx); + BarrierManager::wait_eq(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, work_tile_info.K_idx); // Perform reduction in workspace BlockStripedReduceT::reduce(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx); @@ -382,16 +382,16 @@ template int32_t increment = work_tile_info.k_tile_count; // Signal our arrival - BarrierManager::arrive_inc(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, increment); + BarrierManager::arrive_inc(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, increment); } else { if (params.reduction_mode_ == ReductionMode::Deterministic) { // Wait until the preceding split added its accumulators - BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx); + BarrierManager::wait_eq(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, work_tile_info.K_idx); } else { // Wait unitl the first split has stored its accumulators - BarrierManager::wait_lt(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, 1); + BarrierManager::wait_lt(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, 1); } // The block computing the final split for the tile adds previously-reduced partials From 76e54f9d3d26cf52116a6cffaac1125e149e8237 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Thu, 29 Aug 2024 11:54:15 +0100 Subject: [PATCH 04/30] Fixed compilation for normal GEMM --- include/cutlass/gemm/kernel/intel_pvc_gemm.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp index 24f799a8a7..a7924db7bb 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp @@ -53,7 +53,7 @@ class GemmUniversal< CollectiveEpilogue_, TileScheduler_, cute::enable_if_t - && cute::is_same_v>> + && !cute::is_same_v>> { public: // From b072873ab17af0fa734f3e52ab50eeaed7528a66 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Thu, 29 Aug 2024 12:40:40 +0100 Subject: [PATCH 05/30] Perform fixup using threadid instead of subgroup_id --- .../gemm/kernel/intel_pvc_gemm_streamk.hpp | 4 +-- .../intel_pvc_tile_scheduler_streamk.hpp | 27 +++++++++---------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp index 4b0e7dbff9..bd35b59e57 100644 --- 
a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp @@ -339,8 +339,8 @@ class GemmUniversal< ); // Perform reduction across splits, if needed - TileScheduler::template fixup( - params.scheduler, work_tile_info, sub_group_id, accumulators); + TileScheduler::template fixup( + params.scheduler, work_tile_info, accumulators); if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) { CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index af929c2b87..b01dd4e882 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -283,31 +283,29 @@ class PersistentTileSchedulerIntelPVCStreamK { } // Performs the reduction across splits for a given output tile. -template +template CUTLASS_DEVICE static void fixup( Params const& params, WorkTileInfo const& work_tile_info, - const uint32_t subgroup_id, FrgTensorC& accumulators, uint32_t num_barriers = 1, uint32_t barrier_idx = 0) { static constexpr uint32_t Offset = static_cast(cutlass::arch::ReservedNamedBarriers::StreamkBarrier0); static constexpr uint32_t MaxNumNamedBarriers = 1; - using BarrierManager = NamedBarrierManager; - return fixup_helper( - params, work_tile_info, subgroup_id, accumulators, num_barriers, barrier_idx); + using BarrierManager = NamedBarrierManager; + return fixup_helper( + params, work_tile_info, accumulators, num_barriers, barrier_idx); } // Helper for performing the reduction across splits for a given output tile. - template + template CUTLASS_DEVICE static void fixup_helper( Params const& params, WorkTileInfo const& work_tile_info, - const uint32_t subgroup_id, FrgTensorC& accumulators, uint32_t num_barriers, uint32_t barrier_idx, @@ -326,24 +324,23 @@ template auto reduction_tile_idx = tile_idx; auto [first_peer_id, my_peer_id, last_peer_id] = tile_peer_range(params, tile_idx, static_cast(work_tile_info.K_idx)); auto reduction_peer_offset = 0; + int barrier_group_thread_idx = ThreadIdxX(); // Reductions use BlockStripedReduce with a width of BarrierManager::ThreadCount under the hood. // Thus, the start of the reduction space is the same across all threads in a warp group. 
int reduction_offset = (cute::size<0>(TileShape{}) * cute::size<1>(TileShape{}) * reduction_tile_idx * num_accumulator_mtxs) + reduction_peer_offset + - (size(accumulators) * subgroup_id * SubgroupSize); + (size(accumulators) * barrier_group_thread_idx * ThreadsPerBlock); ElementAccumulator* group_reduction_workspace = reinterpret_cast(params.reduction_workspace_) + reduction_offset; using AccumulatorArrayT = Array; - using BlockStripedReduceT = BlockStripedReduce; + using BlockStripedReduceT = BlockStripedReduce; AccumulatorArrayT* reduction_workspace_array = reinterpret_cast(group_reduction_workspace); AccumulatorArrayT* accumulator_array = reinterpret_cast(accumulators.data()); - int barrier_group_thread_idx = ThreadIdxX() % SubgroupSize; - // The number of tiles for which reduction is required is either: // (a) the total number of output tiles (in the case of split-K) // (b) the number of stream-K tiles (potentially multiplied by peer count if using separate reduction) @@ -371,7 +368,7 @@ template } else { // Wait until the preceding split added its accumulators - BarrierManager::wait_eq(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, work_tile_info.K_idx); + BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx); // Perform reduction in workspace BlockStripedReduceT::reduce(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx); @@ -382,16 +379,16 @@ template int32_t increment = work_tile_info.k_tile_count; // Signal our arrival - BarrierManager::arrive_inc(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, increment); + BarrierManager::arrive_inc(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, increment); } else { if (params.reduction_mode_ == ReductionMode::Deterministic) { // Wait until the preceding split added its accumulators - BarrierManager::wait_eq(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, work_tile_info.K_idx); + BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx); } else { // Wait unitl the first split has stored its accumulators - BarrierManager::wait_lt(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, 1); + BarrierManager::wait_lt(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, 1); } // The block computing the final split for the tile adds previously-reduced partials From 6be72be8344a7a1cf44005a4c98b4637385eabfc Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Thu, 29 Aug 2024 15:14:58 +0100 Subject: [PATCH 06/30] Fixed the k_idx offset for MMA atom and corrected the reduction offset calculation --- examples/sycl/pvc/pvc_gemm.cpp | 2 ++ include/cutlass/barrier.h | 2 +- include/cutlass/functional.h | 2 +- include/cutlass/gemm/collective/intel_pvc_mma.hpp | 5 +++-- .../cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp | 5 ++--- include/cutlass/gpu_generics.h | 4 ++-- 6 files changed, 11 insertions(+), 9 deletions(-) diff --git a/examples/sycl/pvc/pvc_gemm.cpp b/examples/sycl/pvc/pvc_gemm.cpp index 2d3d2665f5..3cd69255eb 100644 --- a/examples/sycl/pvc/pvc_gemm.cpp +++ b/examples/sycl/pvc/pvc_gemm.cpp @@ -244,6 +244,8 @@ struct ExampleRunner { GPU_Clock timer; timer.start(); for (int i = 0; i < options.iterations; ++i) { + if(workspace_size > 0) + gemm_op.initialize(arguments, workspace.get()); gemm_op.run(); } syclcompat::wait(); diff --git a/include/cutlass/barrier.h b/include/cutlass/barrier.h index f561ec664e..d5dfaf4a63 100644 --- a/include/cutlass/barrier.h +++ 
b/include/cutlass/barrier.h @@ -98,7 +98,7 @@ struct GenericBarrier { int state = 0; #ifdef SYCL_INTEL_TARGET - auto atm = sycl::atomic_ref(*ptr); return atm.load(sycl::memory_order::acquire); diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h index 9fa7bf7ba1..48ec345cec 100644 --- a/include/cutlass/functional.h +++ b/include/cutlass/functional.h @@ -782,7 +782,7 @@ struct atomic_add CUTLASS_DEVICE void operator()(T *ptr, const T &data) { -#if defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(CUTLASS_ENABLE_SYCL) atomicAdd(ptr, data); #endif } diff --git a/include/cutlass/gemm/collective/intel_pvc_mma.hpp b/include/cutlass/gemm/collective/intel_pvc_mma.hpp index 38239aaddc..a12a05ded2 100644 --- a/include/cutlass/gemm/collective/intel_pvc_mma.hpp +++ b/include/cutlass/gemm/collective/intel_pvc_mma.hpp @@ -231,7 +231,7 @@ struct CollectiveMma< // // Mainloop // - const int k_start_idx = crd2idx((*k_tile_iter), make_shape(K)); + const int k_start_idx = crd2idx((*k_tile_iter), make_shape(K)) * get<2>(SubgroupTileShape{}); int prefetch_k = k_start_idx; for (int i = 0; i < DispatchPolicy::Stages; i++) { @@ -241,13 +241,14 @@ struct CollectiveMma< } for (int k_tile = 0, k = k_start_idx; k_tile < k_tile_count; - ++k_tile, k += get<2>(SubgroupTileShape{}), prefetch_k += get<2>(SubgroupTileShape{})) { + ++k_tile, k += get<2>(SubgroupTileShape{})) { // Copy gmem to rmem for the first k_tile copy(mainloop.gmem_tiled_copy_a, gA(_, _, k), tAr); copy(mainloop.gmem_tiled_copy_b, gB(_, _, k), tBr); prefetch(mainloop.gmem_tiled_copy_a, tAi(_, _, prefetch_k)); prefetch(mainloop.gmem_tiled_copy_b, tBi(_, _, prefetch_k)); + prefetch_k += get<2>(SubgroupTileShape{}); cute::gemm(tiled_mma, accum, tAr_view, tBr_view, src_accum); } diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index b01dd4e882..4a4d739b71 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -330,8 +330,7 @@ template // Thus, the start of the reduction space is the same across all threads in a warp group. 
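     // Editor's walkthrough (illustrative numbers, not part of the patch): suppose an
     // output tile has 24 K tiles covered by three stream-K units of 8 K tiles each.
     // The unit with K_idx == 0 stores its partials and arrives with increment 8; the
     // unit with K_idx == 8 waits until the lock equals 8, reduces into the workspace
     // and arrives with increment 8 (lock becomes 16); the final unit (K_idx == 16)
     // waits for 16, adds the accumulated partials to its registers via load_add and
     // then computes the epilogue.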
int reduction_offset = (cute::size<0>(TileShape{}) * cute::size<1>(TileShape{}) * reduction_tile_idx * num_accumulator_mtxs) + - reduction_peer_offset + - (size(accumulators) * barrier_group_thread_idx * ThreadsPerBlock); + reduction_peer_offset; ElementAccumulator* group_reduction_workspace = reinterpret_cast(params.reduction_workspace_) + reduction_offset; @@ -739,7 +738,7 @@ CUTLASS_DEVICE work_tile_info.N_idx = work_idx_n; work_tile_info.L_idx = work_idx_l; - // if(linear_idx >= 32 && ThreadIdxX() == 0) + // if(linear_idx < 32 && ThreadIdxX() == 0) // printf("BlockID: %lu | k_tile_count: %d | M_idx: %lu | N_idx: %lu | K_idx: %lu | L_idx: %lu | ctas_per_grid_dim: %lu | output_tile_id: %lu\n", // BlockIdxY(), work_tile_info.k_tile_count, work_tile_info.M_idx, work_tile_info.N_idx, work_tile_info.K_idx, // work_tile_info.L_idx, remainder, output_tile_id); diff --git a/include/cutlass/gpu_generics.h b/include/cutlass/gpu_generics.h index 2c32760e83..a406791437 100644 --- a/include/cutlass/gpu_generics.h +++ b/include/cutlass/gpu_generics.h @@ -314,8 +314,8 @@ using cudaStream_t = void *; using dim3 = syclcompat::dim3; // Atomic - -CUTLASS_DEVICE int atomicAdd(int *address, int val) { +template +CUTLASS_DEVICE T atomicAdd(T *address, T val) { #if defined(__SYCL_DEVICE_ONLY__) return syclcompat::atomic_fetch_add(address, val); #endif From e00f9dade4bae2bbf19ac47df9fa9620c08fb9a3 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Thu, 29 Aug 2024 16:34:20 +0100 Subject: [PATCH 07/30] Use log2 for available_xecores --- .../gemm/kernel/intel_pvc_gemm_streamk.hpp | 30 +++++--- ...rsistent_tile_scheduler_params_streamk.hpp | 76 +------------------ .../intel_pvc_tile_scheduler_streamk.hpp | 4 +- 3 files changed, 23 insertions(+), 87 deletions(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp index bd35b59e57..0f597a92cf 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp @@ -289,9 +289,25 @@ class GemmUniversal< constexpr auto workgroup_shape = WorkgroupTileShape{}; // (BLK_M,BLK_N,BLK_K) constexpr auto subgroup_shape = SubgroupTileShape{}; // (SUB_M,SUB_N,SUB_K) + constexpr int version = + is_same_v + ? 1 + : 2; + + auto k_residue = K - get<2>(subgroup_shape) * (K / get<2>(subgroup_shape)); // K - SUB_K * k_coord_max + + TiledMma tiled_mma; + CollectiveMainloop collective_mma; + CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; + + Tensor accumulators = make_tensor(Shape, Int, Int>{}); + + const int m_offset = sub_group_id / CollectiveMainloop::sg_per_wg_n * get<0>(subgroup_shape); + const int n_offset = sub_group_id % CollectiveMainloop::sg_per_wg_n * get<1>(subgroup_shape); while (work_tile_info.is_valid()) { - const int m_coord = work_tile_info.M_idx * get<0>(workgroup_shape) + sub_group_id / CollectiveMainloop::sg_per_wg_n * get<0>(subgroup_shape); - const int n_coord = work_tile_info.N_idx * get<1>(workgroup_shape) + sub_group_id % CollectiveMainloop::sg_per_wg_n * get<1>(subgroup_shape); + const int m_coord = work_tile_info.M_idx * get<0>(workgroup_shape) + m_offset; + const int n_coord = work_tile_info.N_idx * get<1>(workgroup_shape) + n_offset; const int l_coord = work_tile_info.L_idx; // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work. 
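      // Editor's note (assumed tile sizes, not part of the patch): with a 256x256
      // work-group tile, a 32x64 sub-group tile and sg_per_wg_n = 4, a work-group at
      // M_idx = 3 with sub_group_id = 5 starts at
      // m_coord = 3 * 256 + (5 / 4) * 32 = 800 and
      // n_coord = N_idx * 256 + (5 % 4) * 64 = N_idx * 256 + 64.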
@@ -304,11 +320,6 @@ class GemmUniversal< make_coord(m_coord, 0, 0), make_shape(_1{}, K, L), make_stride(Int{} * get<0>(MmaAtomShape()),_1{})); - constexpr int version = - is_same_v - ? 1 - : 2; Tensor tBi = params.mainloop.gmem_tiled_copy_b.get_pvc_tensor( make_coord(n_coord, 0, 0), @@ -318,13 +329,10 @@ class GemmUniversal< // Compute tile residues for predication auto m_max_coord = M - get<0>(subgroup_shape) * m_coord; // M - SUB_M * m_coord auto n_max_coord = N - get<1>(subgroup_shape) * n_coord; // N - SUB_N * n_coord - auto k_residue = K - get<2>(subgroup_shape) * (K / get<2>(subgroup_shape)); // K - SUB_K * k_coord_max auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue); - Tensor accumulators = make_tensor(Shape, Int, Int>{}); clear(accumulators); - CollectiveMainloop collective_mma; // Perform the collective scoped MMA collective_mma( accumulators, @@ -343,8 +351,6 @@ class GemmUniversal< params.scheduler, work_tile_info, accumulators); if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) { - CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; - TiledMma tiled_mma; epilogue( problem_shape_MNKL, subgroup_shape, diff --git a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp index 0d22afeb8e..7e5e22e09b 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp @@ -84,10 +84,8 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { StreamK }; - FastDivmodU64Pow2 divmod_cluster_shape_major_{}; - FastDivmodU64Pow2 divmod_cluster_shape_minor_{}; FastDivmodU64 divmod_batch_{}; - FastDivmodU64 divmod_cluster_blk_major_{}; + FastDivmodU64 divmod_blk_major_{}; // We divide up the number of stream-K tiles amongst G groups of stream-K units. // The stream-K units within a group collaborate to comptue over the `sk_tiles / G` @@ -292,65 +290,6 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { return; } - bool do_separate_reduction = false; - // should_perform_separate_reduction( - // epilogue_subtile, sk_units, sk_tiles, dp_tiles, ctas_per_wave); - - // Determine the number of stream-K groups that will be used. Choosing the - // fast moving dimension of the underlying grid. - /*uint32_t max_groups_problem = problem_blocks_n; - - // Select the number of groups that will be use. 
We start with the maximum - // number of potential groups, and iterate down looking for a group size that - // evenly divides the stream-K units and tiles, and for which the resulting - // number of K tiles per stream-K unit remains above min_iters_per_sk_unit_ - - uint32_t groups = platform::min(max_groups_problem, uint32_t(max_sk_groups_)); - - // Grouping is disabled when separate reduction is used - // if (do_separate_reduction) { - // groups = 1; - // } - - uint32_t fallback_groups = 0; - - auto sk_splits_too_small = [&](uint32_t g) { - // Check whether the number of K tiles computed per stream-K unit is less - // than min_iters_per_sk_unit_ - auto total_sk_tiles = sk_tiles / g; - auto total_sk_k_tiles = total_sk_tiles * k_tiles_per_output_tile; - auto k_tiles_per_sk_unit = total_sk_k_tiles / (sk_units / g); - return k_tiles_per_sk_unit < min_iters_per_sk_unit_; - }; - - auto is_ideal_grouping = [&](uint32_t g) { - // An ideal grouping will evenly divide stream-K clusters, evenly divide - // stream-K tiles, and not result in stream-K splits that are too small. - return (sk_units % g == 0) && (sk_tiles % g == 0) && !sk_splits_too_small(g); - }; - - auto is_valid_grouping = [&](uint32_t g) { - // A grouping is valid, but not ideal, if it evenly divides the - // stream-K clusters and does not result in stream-K splits that are - // too small. Such a setting can be used as a fallback option in the - // case that an ideal grouping is not achievable - return sk_units % g == 0 && !sk_splits_too_small(g); - }; - - while (groups > 1 && !is_ideal_grouping(groups)) { - if (fallback_groups == 0 && is_valid_grouping(groups)) { - // Set fallback groups once in preference for a larger number of groups. - fallback_groups = groups; - } - --groups; - } - - // If groups == 1, we did not find a group count that satisfies all criteria. If we have - // found a fallback group count, use this instead. - if (groups == 1 && fallback_groups > 0) { - groups = fallback_groups; - }*/ - uint32_t groups = max_sk_groups_; auto sk_units_per_group = sk_units / groups; @@ -374,13 +313,6 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // Use separate reduction when we have less than one wave of output tiles (dp_tiles == 0) // and when each tile will be operated on by at least two stream-K units (sk_units > 2 * sk_tiles) - // if (do_separate_reduction) { - // // Each reduction unit will reduce the partials of an epilogue subtile for - // // a given output tile and compute the epilogue. Thus, there are as many reduction - // // units as there are epilogue subtiles. - // reduction_units = sk_tiles * epilogue_subtile; - // } - // else if (decomposition_mode == DecompositionMode::Heuristic && sk_tiles < sk_units && sk_units % sk_tiles == 0) { // If the number of stream-K units is a multiple of the number of stream-K tiles, then // the problem can leverage a basic split-K decomposition for the stream-K tiles. 
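[Editorial aside, not part of the patch] A worked example, with made-up sizes, of the heuristic branch at the end of the hunk above: when the stream-K units evenly divide the stream-K tiles, the scheduler falls back to a split-K style assignment of those tiles.

#include <cstdio>

int main() {
  const unsigned sk_tiles = 4;                  // output tiles left over for stream-K (assumed)
  const unsigned sk_units = 16;                 // work-groups available to cover them (assumed)
  const unsigned k_tiles_per_output_tile = 32;  // K iterations per output tile (assumed)

  if (sk_tiles < sk_units && sk_units % sk_tiles == 0) {
    const unsigned splits = sk_units / sk_tiles;                        // 4 units per output tile
    const unsigned k_tiles_per_unit = k_tiles_per_output_tile / splits; // 8 K tiles per unit
    std::printf("split-K fallback: %u-way split, %u K tiles per unit\n", splits, k_tiles_per_unit);
  }
  return 0;
}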
@@ -403,9 +335,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { divmod_sk_groups_ = FastDivmodU64(static_cast(groups)); divmod_sk_units_per_group_ = FastDivmodU64(static_cast(sk_units / groups)); - divmod_cluster_shape_major_ = FastDivmodU64Pow2(1); - divmod_cluster_shape_minor_ = FastDivmodU64Pow2(1); - divmod_cluster_blk_major_ = FastDivmodU64(problem_blocks_n); + divmod_blk_major_ = FastDivmodU64(problem_blocks_n); divmod_splits_ = FastDivmod(splits); units_per_problem_ = static_cast(dp_units + sk_units); @@ -469,7 +399,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { dim3 problem_blocks, KernelHardwareInfo hw_info ) { - uint32_t available_sms = 32;//hw_info.sm_count / 8; + uint32_t available_sms = 1 << find_log2(hw_info.sm_count / 8); // printf("available_sms: %d\n", available_sms); auto possibly_truncate = [&](int x, int y) { return static_cast(platform::min(x, y)); diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index 4a4d739b71..142bd3ad8f 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -415,7 +415,7 @@ template output_tile_index(Params const& params, WorkTileInfo const& work_tile_info) { uint64_t linear_idx_in_batch = Params::get_linear_idx_from_m_and_n( work_tile_info.M_idx, work_tile_info.N_idx, - params.divmod_cluster_blk_major_ + params.divmod_blk_major_ ); uint64_t tiles_mn = params.divmod_batch_.divisor; @@ -730,7 +730,7 @@ CUTLASS_DEVICE auto [work_idx_m, work_idx_n] = Params::get_work_idx_m_and_n( cta_per_grid_dim, - params.divmod_cluster_blk_major_ + params.divmod_blk_major_ ); // Set the M, N, and L block offsets From 224b31678724a260d032819770f231a83db35ddd Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 2 Sep 2024 12:03:17 +0100 Subject: [PATCH 08/30] SplitK working --- cmake/FindDPCPP.cmake | 1 + include/cutlass/arch/barrier.h | 15 ++++++++++++--- include/cutlass/functional.h | 2 +- ...c_persistent_tile_scheduler_params_streamk.hpp | 4 ++++ .../kernel/intel_pvc_tile_scheduler_streamk.hpp | 13 +++++++------ 5 files changed, 25 insertions(+), 10 deletions(-) diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake index 6ed98d3e4b..a18eda051c 100644 --- a/cmake/FindDPCPP.cmake +++ b/cmake/FindDPCPP.cmake @@ -43,6 +43,7 @@ set(DPCPP_COMPILE_ONLY_FLAGS "") if(NOT "${DPCPP_SYCL_TARGET}" STREQUAL "") list(APPEND DPCPP_FLAGS "-fsycl-targets=${DPCPP_SYCL_TARGET};") + list(APPEND DPCPP_FLAGS "-Xspirv-translator;--spirv-ext=+SPV_INTEL_split_barrier;-DSPV_INTEL_split_barrier;") endif() if(NOT "${DPCPP_USER_FLAGS}" STREQUAL "") diff --git a/include/cutlass/arch/barrier.h b/include/cutlass/arch/barrier.h index cd2d7be3cb..0ac6da8813 100644 --- a/include/cutlass/arch/barrier.h +++ b/include/cutlass/arch/barrier.h @@ -36,7 +36,11 @@ #include #include -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && (__CUDACC_VER_MAJOR__ >= 12) + +#if defined SYCL_INTEL_TARGET +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierWaitINTEL(int, int, int); +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierArriveINTEL(int, int, int); +#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && (__CUDACC_VER_MAJOR__ >= 12) #define CUDA_BARRIER_ENABLED 1 #else #define CUDA_BARRIER_ENABLED 0 @@ -151,7 +155,10 @@ class NamedBarrier { private: CUTLASS_DEVICE static void arrive_and_wait_internal(uint32_t num_threads, uint32_t barrier_id) { 
-#if CUDA_BARRIER_ENABLED +#if defined SYCL_INTEL_TARGET + __spirv_ControlBarrierArriveINTEL(2, 2, 0x4); + __spirv_ControlBarrierWaitINTEL(2, 2, 0x2); +#elif defined CUDA_BARRIER_ENABLED asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads)); #elif defined(__CUDA_ARCH__) asm volatile ("brkpt;\n" ::); @@ -160,7 +167,9 @@ class NamedBarrier { CUTLASS_DEVICE static void arrive_internal(uint32_t num_threads, uint32_t barrier_id) { -#if CUDA_BARRIER_ENABLED +#if defined SYCL_INTEL_TARGET + __spirv_ControlBarrierArriveINTEL(2, 2, 0x4); +#elif CUDA_BARRIER_ENABLED asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads)); #elif defined(__CUDA_ARCH__) asm volatile ("brkpt;\n" ::); diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h index 48ec345cec..57835f8d35 100644 --- a/include/cutlass/functional.h +++ b/include/cutlass/functional.h @@ -782,7 +782,7 @@ struct atomic_add CUTLASS_DEVICE void operator()(T *ptr, const T &data) { -#if defined(__CUDA_ARCH__) || defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) || defined(__SYCL_DEVICE_ONLY__) atomicAdd(ptr, data); #endif } diff --git a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp index 7e5e22e09b..5691059817 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp @@ -705,8 +705,11 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { divmod_batch_ = FastDivmodU64(blocks_m * blocks_n); divmod_tiles_per_output_tile_ = FastDivmod(k_tiles_per_output_tile); + divmod_sk_groups_ = FastDivmodU64(1u); divmod_splits_ = FastDivmod(splits); + divmod_blk_major_ = FastDivmodU64(blocks_n); units_per_problem_ = blocks_m * blocks_n * blocks_l; + big_units_ = k_tiles_per_output_tile % splits; reduction_workspace_ = reduction_workspace; reduction_mode_ = reduction_mode; divmod_k_tiles_per_sk_unit_ = FastDivmod(k_tiles_per_output_tile / splits); @@ -714,6 +717,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // No stream-K work is performed for "basic" data-parallel and split-K decompositions sk_tiles_ = 0; sk_units_ = 0; + divmod_sk_units_per_group_ = FastDivmodU64(blocks_m * blocks_n * blocks_l); } private: diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index 142bd3ad8f..b496ed33aa 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -322,7 +322,6 @@ template auto lock_idx = (tile_idx * num_barriers) + barrier_idx; auto reduction_tile_idx = tile_idx; - auto [first_peer_id, my_peer_id, last_peer_id] = tile_peer_range(params, tile_idx, static_cast(work_tile_info.K_idx)); auto reduction_peer_offset = 0; int barrier_group_thread_idx = ThreadIdxX(); @@ -386,7 +385,7 @@ template BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx); } else { - // Wait unitl the first split has stored its accumulators + // Wait until the first split has stored its accumulators BarrierManager::wait_lt(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, 1); } @@ -732,16 +731,18 @@ CUTLASS_DEVICE cta_per_grid_dim, params.divmod_blk_major_ ); - + if(params.divmod_splits_.divisor > 1) { + work_idx_l /= 
params.divmod_splits_.divisor; + } // Set the M, N, and L block offsets work_tile_info.M_idx = work_idx_m; work_tile_info.N_idx = work_idx_n; work_tile_info.L_idx = work_idx_l; - // if(linear_idx < 32 && ThreadIdxX() == 0) - // printf("BlockID: %lu | k_tile_count: %d | M_idx: %lu | N_idx: %lu | K_idx: %lu | L_idx: %lu | ctas_per_grid_dim: %lu | output_tile_id: %lu\n", + // if(ThreadIdxX() == 0) + // printf("BlockID: %lu | k_tile_count: %d | M_idx: %lu | N_idx: %lu | K_idx: %lu | L_idx: %lu | ctas_per_grid_dim: %lu | output_tile_id: %lu | unit_iter_start: %d | unit_iter_end: %d | tile_start: %d | tile_end: %d | split: %lu\n", // BlockIdxY(), work_tile_info.k_tile_count, work_tile_info.M_idx, work_tile_info.N_idx, work_tile_info.K_idx, - // work_tile_info.L_idx, remainder, output_tile_id); + // work_tile_info.L_idx, remainder, output_tile_id, unit_iter_start, unit_iter_end, tile_iter_start, tile_iter_end, split); } // Returns the starting and ending peer ID of this tile From 59b884f11cb004b80281e2ccd420ad05c808f6e5 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 2 Sep 2024 13:43:15 +0100 Subject: [PATCH 09/30] Minor cleanup * Need to fix splitK for batch > 1 --- .../gemm/kernel/intel_pvc_gemm_streamk.hpp | 47 +------ ...rsistent_tile_scheduler_params_streamk.hpp | 23 +--- .../intel_pvc_tile_scheduler_streamk.hpp | 124 ++---------------- 3 files changed, 23 insertions(+), 171 deletions(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp index 0f597a92cf..3f9424b0cb 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp @@ -91,8 +91,6 @@ class GemmUniversal< using EpilogueArguments = typename CollectiveEpilogue::Arguments; using EpilogueParams = typename CollectiveEpilogue::Params; -// static_assert(ArchTag::kMinComputeCapability >= 90); - using TileSchedulerTag = TileScheduler_; using TileScheduler = typename detail::TileSchedulerSelector< TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler; @@ -109,13 +107,6 @@ class GemmUniversal< static constexpr int VecC = CollectiveMainloop::VecC; - /// Register requirement for Load and Math WGs -// static constexpr uint32_t LoadRegisterRequirement = 40; -// static constexpr uint32_t MmaRegisterRequirement = 232; - - // 1 stage ordered sequence between mainloop and epilogue producer load threads -// using LoadWarpOrderBarrier = cutlass::OrderedSequenceBarrier<1,2>; - // Kernel level shared memory storage struct SharedStorage { using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage; @@ -173,28 +164,15 @@ class GemmUniversal< // Calculate workspace pointers uint8_t* workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - - void* scheduler_workspace = workspace_ptr; - workspace_offset += TileScheduler::template get_workspace_size( - args.scheduler, args.problem_shape, args.hw_info); - - void* epilogue_workspace = workspace_ptr + workspace_offset; - workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue); - void* mainloop_workspace = nullptr; - // Precompute the sub tiles numbers in epilogue, pass into tile scheduler. Therefore it will be used - // in separate reduction scheme for streamk case, NumEpilogueSubTiles default value is 1, which means - // subtile will not be used, therefore separate reduction will not be enabled. 
- // constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{}); TileSchedulerParams scheduler = TileScheduler::to_underlying_arguments( - problem_shape_MNKL, TileShape{}, hw_info, args.scheduler, scheduler_workspace); + problem_shape_MNKL, TileShape{}, hw_info, args.scheduler, workspace_ptr); return { args.mode, problem_shape, - CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, mainloop_workspace), - CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, epilogue_workspace), + CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace_ptr), + CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace_ptr), hw_info, scheduler, workspace @@ -211,10 +189,8 @@ class GemmUniversal< static size_t get_workspace_size(Arguments const& args) { size_t workspace_size = 0; - workspace_size += TileScheduler::template get_workspace_size( args.scheduler, args.problem_shape, args.hw_info); - workspace_size += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue); return workspace_size; } @@ -223,21 +199,9 @@ class GemmUniversal< CudaHostAdapter* cuda_adapter = nullptr) { Status status = Status::kSuccess; uint8_t* workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; status = TileScheduler::template initialize_workspace( - args.scheduler, workspace_ptr + workspace_offset, args.problem_shape, args.hw_info); - workspace_offset += TileScheduler::template get_workspace_size( - args.scheduler, args.problem_shape, args.hw_info); - if (status != Status::kSuccess) { - return status; - } - - status = CollectiveEpilogue::initialize_workspace(args.problem_shape, args.epilogue, workspace_ptr + workspace_offset, stream, cuda_adapter); - workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue); - if (status != Status::kSuccess) { - return status; - } + args.scheduler, workspace_ptr, args.problem_shape, args.hw_info); return status; } @@ -246,7 +210,6 @@ class GemmUniversal< static dim3 get_grid_shape(Params const& params) { // Given device SM count, set grid size s.t. 
we do not launch more thread blocks than we can run concurrently - // TileSchedulerArguments args{}; return TileScheduler::get_grid_shape(params.problem_shape, TileShape{}, params.hw_info); } @@ -262,7 +225,6 @@ class GemmUniversal< using X = Underscore; // Preconditions - // static_assert(size(TiledMma{}) == 256, "Cooperative kernel must have TiledMMA operating using 256 threads."); static_assert(size<0>(TileShape{}) >= 128, "Cooperative kernel requires Tile Size to be greater than or equal to 128 along the M-dimension."); @@ -305,6 +267,7 @@ class GemmUniversal< const int m_offset = sub_group_id / CollectiveMainloop::sg_per_wg_n * get<0>(subgroup_shape); const int n_offset = sub_group_id % CollectiveMainloop::sg_per_wg_n * get<1>(subgroup_shape); + while (work_tile_info.is_valid()) { const int m_coord = work_tile_info.M_idx * get<0>(workgroup_shape) + m_offset; const int n_coord = work_tile_info.N_idx * get<1>(workgroup_shape) + n_offset; diff --git a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp index 5691059817..1201347fd8 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp @@ -143,18 +143,6 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // ktile start from even for each cta uint32_t ktile_start_alignment_count { 1u }; - // Returns the maximum number of peers that can collaborate on a given output tile - CUTLASS_HOST_DEVICE - static uint32_t - max_peers_per_tile(uint64_t sk_units, uint64_t sk_tiles) { - // When we can divide up our SK units to SK tiles evenly, the number of peers - // per SK tile is exactly (sk_units_ / sk_tiles_). In cases where this division - // is not exact, some tiles will need to be covered by additional SK units. Because - // the extra work can occur at both the beginning and the end of the SK tile, at - // most 2 extra peers will be needed. - return static_cast(sk_units / sk_tiles + 2); - } - // Initializes members. This variant of the method should only be used when // problem_shape and tile_shape contain modes of only rank 1. 
void @@ -397,14 +385,17 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { static dim3 get_grid_shape( dim3 problem_blocks, - KernelHardwareInfo hw_info + KernelHardwareInfo hw_info, + bool truncate_range = true ) { uint32_t available_sms = 1 << find_log2(hw_info.sm_count / 8); - // printf("available_sms: %d\n", available_sms); auto possibly_truncate = [&](int x, int y) { - return static_cast(platform::min(x, y)); + if(truncate_range) + return static_cast(platform::min(x, y)); + else + return static_cast(x); }; - return dim3{1, possibly_truncate(available_sms, problem_blocks.x * problem_blocks.y * problem_blocks.z), 1}; + return dim3{possibly_truncate(available_sms, problem_blocks.x * problem_blocks.y * problem_blocks.z), 1, 1}; } // Returns the number of stream-K tiles that will be computed amongst `output_tiles` total diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index b496ed33aa..bc2e896e82 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -79,8 +79,7 @@ class PersistentTileSchedulerIntelPVCStreamK { CUTLASS_HOST_DEVICE bool is_valid() const { - // A work tile that computes no K tiles is invalid unless it is a separate-reduction work tile - // (which only performs reduction and epilogue) + // A work tile that computes no K tiles is invalid return k_tile_count > 0; } @@ -132,9 +131,9 @@ class PersistentTileSchedulerIntelPVCStreamK { // The splitting factor to be used in a split-K decomposition of the problem. // If this is set to a value greater than 1, stream-K decomposition logic // is bypassed in favor of a split-K decomposition. - int splits = 1; + int splits = 16; ReductionMode reduction_mode = ReductionMode::Deterministic; - DecompositionMode decomposition_mode = DecompositionMode::StreamK; + DecompositionMode decomposition_mode = DecompositionMode::SplitK; }; // Sink scheduler params as a member @@ -185,8 +184,7 @@ class PersistentTileSchedulerIntelPVCStreamK { CUTLASS_HOST_DEVICE PersistentTileSchedulerIntelPVCStreamK(Params const& params_) : scheduler_params(params_) { - // current_work_linear_idx_ = uint64_t(BlockIdxX()) + uint64_t(BlockIdxY()) * uint64_t(GridDimX()); - current_work_linear_idx_ = uint64_t(BlockIdxY()); + current_work_linear_idx_ = uint64_t(BlockIdxX()); } CUTLASS_DEVICE @@ -403,7 +401,6 @@ template // `is_final_split` will be set to `true` for the following scenarios, all of which must compute the epilogue: // 1. The tile is computed in data-parallel mode // 2. The tile is computed in split-/stream-K mode and this work unit represents the final split of the tile - // 3. 
The tile is computed in split-/stream-K mode and separate reduction is used, and this is a separate reduction unit return work_tile_info.is_valid() && work_tile_info.is_final_split(params.divmod_tiles_per_output_tile_.divisor); } @@ -519,19 +516,7 @@ CUTLASS_DEVICE WorkTileInfo& work_tile_info) { uint64_t output_tile_id = linear_idx; - if (linear_idx >= params.units_per_problem_ * params.divmod_splits_.divisor) { - // Separate-reduction work - /*auto cluster_size = params.get_cluster_size(); - // Divide up the linearized separate reduction units into clusters - auto cluster_linear_reduction_unit_idx = params.div_cluster_size((linear_idx - params.units_per_problem_)); - uint64_t cluster_tile_idx, epi_subtile_idx; - params.divmod_epilogue_subtile_(cluster_tile_idx, epi_subtile_idx, cluster_linear_reduction_unit_idx); - // Bring the linearized tile ID back into the space of tiles, rather than clusters - output_tile_id = cluster_tile_idx * cluster_size; - - work_tile_info.setup_separate_reduction(epi_subtile_idx);*/ - } - else if (linear_idx >= params.sk_units_ && params.divmod_splits_.divisor == 1) { + if (linear_idx >= params.sk_units_ && params.divmod_splits_.divisor == 1) { // Data-parallel work output_tile_id = linear_idx - params.sk_units_ + params.sk_tiles_; work_tile_info.K_idx = 0; @@ -539,38 +524,10 @@ CUTLASS_DEVICE work_tile_info.k_tile_remaining = params.divmod_tiles_per_output_tile_.divisor; } else { - // In the CUTLASS 2.x implementation of stream K, stream-K work is assigned to each stream-K - // threadblock individually. For the most part, the set of K iterations corresponding to stream-K - // work was divided amongst stream-K threadblocks, and a threadblock determined which tile - // it would compute a (potentially-partial) output tile for based on the space of k iterations - // assigned to it. This often results in stream-K threadblocks processing tiles with different - // offsets in the K dimension from one another. This can reduce locality, but is lmitied to the - // (generally few) waves of threadblocks assigned to compute stream-K work. - // - // With the introduction of threadblock clusters, there is additional benefit to maintaining - // locality in the K dimension: shared portions of operands can be multicasted to threadblocks - // within a cluster. Thus, we would like to ensure that the assignment of stream-K work to - // threadblocks respects the ability to perform multicasting. - // - // To do so, we divide up the linearized stream-K units into clusters and share the same K - // offsets for work within clusters. - - // auto cluster_linear_work_idx = params.div_cluster_size(linear_idx); - - // uint64_t group_idx; - // params.divmod_sk_groups_(cluster_linear_work_idx, group_idx, cluster_linear_work_idx); - - // // Determine whether we are in a "big group" that will process an additional - // // stream-K cluster tile. - // auto sk_cluster_tiles = params.div_cluster_size(params.sk_tiles_); - // auto sk_tiles_in_group = params.divmod_sk_groups_.divide(params.sk_tiles_); - // if (group_idx < params.big_groups_) { - // ++sk_cluster_tiles_in_group; - // } - // // Determine whether we are in a "big unit" within the group, that will process - // // an additional K chunk in the group. - auto sk_tiles_in_group = params.sk_tiles_;//sk_cluster_tiles_in_group * params.get_cluster_size(); + // Determine whether we are in a "big unit" within the group, that will process + // an additional K chunk in the group. 
+ auto sk_tiles_in_group = params.sk_tiles_; auto k_tiles_in_group = sk_tiles_in_group * params.divmod_tiles_per_output_tile_.divisor; auto k_tiles_per_unit_in_group = params.divmod_sk_units_per_group_.divide(k_tiles_in_group); // auto big_units_in_group = params.div_cluster_size( @@ -696,17 +653,6 @@ CUTLASS_DEVICE // overall linearized space. output_tile_id = output_tile_id_in_group * params.divmod_sk_groups_.divisor; - // Bring the linearized tile ID back into the space of tiles, rather than clusters - // output_tile_id *= params.get_cluster_size(); - - // The final linearized tile ID is in units of the cluster dimension over which we rasterize. - // if (params.raster_order_ == RasterOrder::AlongN) { - // output_tile_id += cta_n_in_cluster * params.divmod_cluster_shape_minor_.divisor; - // } - // else { - // output_tile_id += cta_m_in_cluster * params.divmod_cluster_shape_minor_.divisor; - // } - // The unit's starting k iteration in the current tile is either the starting // iteration for the tile as a whole, or the starting k iteration for the unit // as a whole (if the latter is greater than the former). @@ -740,59 +686,11 @@ CUTLASS_DEVICE work_tile_info.L_idx = work_idx_l; // if(ThreadIdxX() == 0) - // printf("BlockID: %lu | k_tile_count: %d | M_idx: %lu | N_idx: %lu | K_idx: %lu | L_idx: %lu | ctas_per_grid_dim: %lu | output_tile_id: %lu | unit_iter_start: %d | unit_iter_end: %d | tile_start: %d | tile_end: %d | split: %lu\n", - // BlockIdxY(), work_tile_info.k_tile_count, work_tile_info.M_idx, work_tile_info.N_idx, work_tile_info.K_idx, - // work_tile_info.L_idx, remainder, output_tile_id, unit_iter_start, unit_iter_end, tile_iter_start, tile_iter_end, split); + // printf("BlockID: %lu | k_tile_count: %d | M_idx: %lu | N_idx: %lu | K_idx: %lu | L_idx: %lu | ctas_per_grid_dim: %lu | output_tile_id: %lu\n", + // BlockIdxX(), work_tile_info.k_tile_count, work_tile_info.M_idx, work_tile_info.N_idx, work_tile_info.K_idx, + // work_tile_info.L_idx, remainder, output_tile_id); } - // Returns the starting and ending peer ID of this tile - CUTLASS_HOST_DEVICE - static auto - tile_peer_range(Params const& params, uint32_t tile_idx, uint32_t cur_k_tile) { - auto tile_idx_in_cluster_path = tile_idx; - auto start_k_tile = params.divmod_tiles_per_output_tile_.divisor * tile_idx_in_cluster_path; - auto end_k_tile = start_k_tile + params.divmod_tiles_per_output_tile_.divisor - 1; - auto big_unit_k_tiles = params.big_units_ * (params.divmod_k_tiles_per_sk_unit_.divisor + 1); - - auto adjust_unit = [&](uint32_t k_tile, uint32_t unit_idx, uint32_t k_tiles_per_unit) { - auto unit_k_start = unit_idx * k_tiles_per_unit; - auto unit_k_end = unit_k_start + k_tiles_per_unit; - if (k_tile - start_k_tile < Params::min_iters_per_sk_unit_ && - unit_k_end - start_k_tile < Params::min_iters_per_sk_unit_) { - // k_tile is within the first min_iters_per_sk_unit_ K tiles of this output tile, - // and the stream-K unit computes fewer than min_iters_per_sk_unit_ K tiles for this - // output tile. This work will thus be subsumed by the next stream-K unit. - ++unit_idx; - } - - if (end_k_tile + 1 - k_tile < Params::min_iters_per_sk_unit_ && - end_k_tile + 1 - unit_k_start < Params::min_iters_per_sk_unit_) { - // k_tile is within the last min_iters_per_sk_unit_ K tiles of this output tile, - // and the stream-K unit computes fewer than min_iters_per_sk_unit_ K tiles for this - // output tile. This work will thus be subsumed by the previous stream-K unit. 
- --unit_idx; - } - - return unit_idx; - }; - - // Lambda to find the ID of the stream-K unit that computes this K tile - auto find_unit = [&](uint32_t k_tile) { - if (k_tile < big_unit_k_tiles) { - // The tile is within the "big unit range" - auto unit_idx = params.divmod_k_tiles_per_sk_big_unit_.divide(k_tile); - return static_cast(adjust_unit(k_tile, unit_idx, params.divmod_k_tiles_per_sk_big_unit_.divisor)); - } - else { - // The tile is after the "big unit range." Account for this by finding the "normal unit" - // that it belongs to, and then offsetting by the number of big units - auto unit_idx = params.divmod_k_tiles_per_sk_unit_.divide(k_tile - big_unit_k_tiles) + params.big_units_; - return static_cast(adjust_unit(k_tile, unit_idx, params.divmod_k_tiles_per_sk_unit_.divisor)); - } - }; - - return cute::make_tuple(find_unit(start_k_tile), find_unit(cur_k_tile), find_unit(end_k_tile)); - } }; } // namespace cutlass::gemm::kernel::detail From 4e9f9c3e7888adba9b731bea9d242020cd41cb80 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 2 Sep 2024 15:46:32 +0100 Subject: [PATCH 10/30] Fixed splitK for batch > 1 --- examples/sycl/pvc/pvc_gemm.cpp | 5 ++++- .../kernel/intel_pvc_tile_scheduler_streamk.hpp | 13 ++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/examples/sycl/pvc/pvc_gemm.cpp b/examples/sycl/pvc/pvc_gemm.cpp index 3cd69255eb..9224c2c071 100644 --- a/examples/sycl/pvc/pvc_gemm.cpp +++ b/examples/sycl/pvc/pvc_gemm.cpp @@ -47,6 +47,7 @@ #include "cutlass/util/reference/device/tensor_compare.h" #include "common.h" +#include "cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp" using namespace cute; /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -219,7 +220,9 @@ struct ExampleRunner { problem_size, {block_A.get(), stride_A, block_B.get(), stride_B}, {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, - hw_info + hw_info, + {1, + cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::StreamK} }; Gemm gemm_op; diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index bc2e896e82..8d33a76c8c 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -131,9 +131,9 @@ class PersistentTileSchedulerIntelPVCStreamK { // The splitting factor to be used in a split-K decomposition of the problem. // If this is set to a value greater than 1, stream-K decomposition logic // is bypassed in favor of a split-K decomposition. 
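    // [Editorial note, not part of the patch] Example with made-up numbers:
    // splits = 4 and k_tiles_per_output_tile = 30 gives 30 / 4 = 7 K tiles per
    // split, with big_units_ = 30 % 4 = 2 splits taking one extra K tile each
    // (8 + 8 + 7 + 7 = 30).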
- int splits = 16; + int splits = 1; ReductionMode reduction_mode = ReductionMode::Deterministic; - DecompositionMode decomposition_mode = DecompositionMode::SplitK; + DecompositionMode decomposition_mode = DecompositionMode::Heuristic; }; // Sink scheduler params as a member @@ -669,6 +669,11 @@ CUTLASS_DEVICE } uint64_t work_idx_l, remainder; + + if(params.divmod_splits_.divisor > 1) { + output_tile_id %= params.units_per_problem_; + } + params.divmod_batch_(work_idx_l, remainder, output_tile_id); uint64_t cta_per_grid_dim = remainder; //params.divmod_cluster_shape_minor_.divide(remainder); @@ -677,9 +682,7 @@ CUTLASS_DEVICE cta_per_grid_dim, params.divmod_blk_major_ ); - if(params.divmod_splits_.divisor > 1) { - work_idx_l /= params.divmod_splits_.divisor; - } + // Set the M, N, and L block offsets work_tile_info.M_idx = work_idx_m; work_tile_info.N_idx = work_idx_n; From 345dcaeb0c9b39bdcced6d6b7294cd9fe9377736 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 2 Sep 2024 17:10:12 +0100 Subject: [PATCH 11/30] Re-enabled GEMM Universal Adater specialization --- include/cutlass/gemm/device/gemm_universal_adapter.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h index e1bf6ee2d6..40a21b1078 100644 --- a/include/cutlass/gemm/device/gemm_universal_adapter.h +++ b/include/cutlass/gemm/device/gemm_universal_adapter.h @@ -91,8 +91,7 @@ class GemmUniversalAdapter; template class GemmUniversalAdapter< GemmKernel_, - // cute::enable_if_t::value>> - cute::enable_if_t> + cute::enable_if_t::value>> { public: using GemmKernel = GemmKernel_; @@ -506,7 +505,7 @@ class GemmUniversalAdapter< ////////////////////////////// CUTLASS 2.x API ///////////////////////////////// //////////////////////////////////////////////////////////////////////////////// -/*template +template class GemmUniversalAdapter< GemmKernel_, cute::enable_if_t::value>> @@ -667,7 +666,7 @@ class GemmUniversalAdapter< return status; } -};*/ +}; //////////////////////////////////////////////////////////////////////////////// From 05b487aedd20664653aa3c0fe51b5d1f95f94b88 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 2 Sep 2024 17:14:45 +0100 Subject: [PATCH 12/30] Update split barrier arguments --- include/cutlass/arch/barrier.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/cutlass/arch/barrier.h b/include/cutlass/arch/barrier.h index 0ac6da8813..756f21971d 100644 --- a/include/cutlass/arch/barrier.h +++ b/include/cutlass/arch/barrier.h @@ -156,8 +156,8 @@ class NamedBarrier { CUTLASS_DEVICE static void arrive_and_wait_internal(uint32_t num_threads, uint32_t barrier_id) { #if defined SYCL_INTEL_TARGET - __spirv_ControlBarrierArriveINTEL(2, 2, 0x4); - __spirv_ControlBarrierWaitINTEL(2, 2, 0x2); + __spirv_ControlBarrierArriveINTEL(2, 2, 0x0); + __spirv_ControlBarrierWaitINTEL(2, 2, 0x0); #elif defined CUDA_BARRIER_ENABLED asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads)); #elif defined(__CUDA_ARCH__) @@ -168,7 +168,7 @@ class NamedBarrier { CUTLASS_DEVICE static void arrive_internal(uint32_t num_threads, uint32_t barrier_id) { #if defined SYCL_INTEL_TARGET - __spirv_ControlBarrierArriveINTEL(2, 2, 0x4); + __spirv_ControlBarrierArriveINTEL(2, 2, 0x0); #elif CUDA_BARRIER_ENABLED asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads)); #elif defined(__CUDA_ARCH__) From bff4801c33f00b6a36668f2d7f042826abd1a4ee Mon Sep 17 
00:00:00 2001 From: Muhammad Tanvir Date: Mon, 2 Sep 2024 17:28:10 +0100 Subject: [PATCH 13/30] Minor cleanup --- .../gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index 8d33a76c8c..578eaa09cf 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -339,7 +339,7 @@ template // The number of tiles for which reduction is required is either: // (a) the total number of output tiles (in the case of split-K) - // (b) the number of stream-K tiles (potentially multiplied by peer count if using separate reduction) + // (b) the number of stream-K tiles // To calculate the total number of output tiles in the split-K case, we // note that, in the split-K case, the units_per_problem_ member of Params will be // the total number of output tiles. @@ -358,8 +358,7 @@ template if (!compute_epilogue(work_tile_info, params)) { if (work_tile_info.K_idx == 0) { - // The first peer initializes the workspace partials in the non-separate-reduction case, - // and all peers write to their own location in workspace when using separate reduction + // The first peer initializes the workspace partials BlockStripedReduceT::store(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx); } else { @@ -370,8 +369,8 @@ template BlockStripedReduceT::reduce(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx); } - // If separate reduction is being performed, each participating stream-K unit increments the barrier - // by only 1. Otherwise, increment by the K tile count that this unit has processed. + // Each participating stream-K unit increments the barrier by the K tile count that this unit has + // processed. 
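      // [Editorial worked example, not part of the patch] Suppose an output tile has
      // 16 K tiles covered by unit A (K_idx 0, k_tile_count 8) and unit B (K_idx 8,
      // k_tile_count 8), with B the final split. A stores its partials to the
      // reduction workspace and bumps the tile's flag by 8; B waits until the flag
      // equals its K_idx (8), folds the workspace partials into its accumulators,
      // and then runs the epilogue instead of writing back to the workspace.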
int32_t increment = work_tile_info.k_tile_count; // Signal our arrival From 3300bf737bf737b98ddb446f4c8f713c0b1e7025 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Tue, 3 Sep 2024 11:14:23 +0100 Subject: [PATCH 14/30] Changed initialization to workspace only --- examples/sycl/pvc/pvc_gemm.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/sycl/pvc/pvc_gemm.cpp b/examples/sycl/pvc/pvc_gemm.cpp index 9224c2c071..18e2b48c10 100644 --- a/examples/sycl/pvc/pvc_gemm.cpp +++ b/examples/sycl/pvc/pvc_gemm.cpp @@ -221,8 +221,8 @@ struct ExampleRunner { {block_A.get(), stride_A, block_B.get(), stride_B}, {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, hw_info, - {1, - cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::StreamK} + {16, + cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::SplitK} }; Gemm gemm_op; @@ -248,7 +248,7 @@ struct ExampleRunner { timer.start(); for (int i = 0; i < options.iterations; ++i) { if(workspace_size > 0) - gemm_op.initialize(arguments, workspace.get()); + Gemm::GemmKernel::initialize_workspace(arguments, workspace.get()); gemm_op.run(); } syclcompat::wait(); From f559e31558255128b5a546533a6816913813bef9 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Tue, 3 Sep 2024 14:05:49 +0100 Subject: [PATCH 15/30] Fix CI failure --- include/cutlass/barrier.h | 4 ++-- include/cutlass/gemm/kernel/tile_scheduler.hpp | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/cutlass/barrier.h b/include/cutlass/barrier.h index d5dfaf4a63..1cfc73c1cb 100644 --- a/include/cutlass/barrier.h +++ b/include/cutlass/barrier.h @@ -97,12 +97,12 @@ struct GenericBarrier { { int state = 0; -#ifdef SYCL_INTEL_TARGET +#if defined (SYCL_INTEL_TARGET) auto atm = sycl::atomic_ref(*ptr); return atm.load(sycl::memory_order::acquire); -#elif defined (__CUDA_ARCH__ >= 700) +#elif (__CUDA_ARCH__ >= 700) /// SM70 and newer use memory consistency qualifiers // Acquire pattern using acquire modifier diff --git a/include/cutlass/gemm/kernel/tile_scheduler.hpp b/include/cutlass/gemm/kernel/tile_scheduler.hpp index e354e7a48c..524884ed6f 100644 --- a/include/cutlass/gemm/kernel/tile_scheduler.hpp +++ b/include/cutlass/gemm/kernel/tile_scheduler.hpp @@ -130,6 +130,7 @@ struct TileSchedulerSelector< using Scheduler = PersistentTileSchedulerSm90StreamK; }; +#if defined (SYCL_INTEL_TARGET) template < class TileShape, class ClusterShape @@ -142,6 +143,7 @@ struct TileSchedulerSelector< > { using Scheduler = PersistentTileSchedulerIntelPVCStreamK; }; +#endif template < class TileShape, From bcf812e7bf3be9af957e8b972219921b47dff742 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 4 Sep 2024 12:04:25 +0100 Subject: [PATCH 16/30] Added support for scheduling non-uniform tiles --- .../intel_pvc_tile_scheduler_streamk.hpp | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index 578eaa09cf..554724c39b 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -529,15 +529,14 @@ CUTLASS_DEVICE auto sk_tiles_in_group = params.sk_tiles_; auto k_tiles_in_group = sk_tiles_in_group * params.divmod_tiles_per_output_tile_.divisor; auto k_tiles_per_unit_in_group = 
params.divmod_sk_units_per_group_.divide(k_tiles_in_group); - // auto big_units_in_group = params.div_cluster_size( - // k_tiles_in_group - (k_tiles_per_unit_in_group * params.divmod_sk_units_per_group_.divisor)); + auto big_units_in_group = k_tiles_in_group - (k_tiles_per_unit_in_group * params.divmod_sk_units_per_group_.divisor); uint64_t split; params.divmod_sk_units_per_group_(split, output_tile_id, output_tile_id); bool is_split_k = params.divmod_splits_.divisor > 1; - auto big_unit_cmp_lhs = output_tile_id; - auto big_unit_cmp_rhs = /*is_split_k ? */params.big_units_;// : big_units_in_group; + auto big_unit_cmp_lhs = is_split_k ? split : output_tile_id; + auto big_unit_cmp_rhs = is_split_k ? params.big_units_ : big_units_in_group; auto linear_idx_mult = is_split_k ? params.divmod_tiles_per_output_tile_.divisor : k_tiles_per_unit_in_group; auto k_tiles_per_split = is_split_k ? params.divmod_k_tiles_per_sk_unit_.divisor : k_tiles_per_unit_in_group; @@ -548,18 +547,18 @@ CUTLASS_DEVICE // compute one extra iteration. If there are any big units, they will be the first // in the linearized ID space. auto k_tiles_in_my_split = k_tiles_per_split; - // if (big_unit_cmp_lhs < big_unit_cmp_rhs) { - // // Since the "big units" are the first units in the linearized ID space, each - // // of the units preceding this big unit computed one extra iteration. Thus, - // // we must offset our start iteration by the number of units that precede - // // the current unit in the linearized ID space. - // unit_iter_start += big_unit_cmp_lhs; - // ++k_tiles_in_my_split; - // } - // else { - // // Increment by one for each of the big clusters (since all big units precede this unit) - // unit_iter_start += big_unit_cmp_rhs; - // } + if (big_unit_cmp_lhs < big_unit_cmp_rhs) { + // Since the "big units" are the first units in the linearized ID space, each + // of the units preceding this big unit computed one extra iteration. Thus, + // we must offset our start iteration by the number of units that precede + // the current unit in the linearized ID space. 
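        // [Editorial worked example, not part of the patch] With made-up numbers:
        // 120 K tiles over 16 stream-K units gives k_tiles_per_unit_in_group = 7 and
        // big_units_in_group = 120 - 7 * 16 = 8, so units 0..7 compute 8 K tiles and
        // units 8..15 compute 7. A big unit with index u therefore starts at
        // u * 7 + u, which is the `unit_iter_start += big_unit_cmp_lhs` adjustment
        // just below; non-big units instead skip past all 8 big units.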
+ unit_iter_start += big_unit_cmp_lhs; + ++k_tiles_in_my_split; + } + else { + // Increment by one for each of the big clusters (since all big units precede this unit) + unit_iter_start += big_unit_cmp_rhs; + } if (!is_split_k) { // Adjust the unit starting position and number of tiles to avoid From 1544d51af38389cb5a810c3f6acecb5b69c05f52 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 4 Sep 2024 12:09:00 +0100 Subject: [PATCH 17/30] Only include split barrier flags for PVC --- cmake/FindDPCPP.cmake | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake index a18eda051c..ba60ed2f8b 100644 --- a/cmake/FindDPCPP.cmake +++ b/cmake/FindDPCPP.cmake @@ -43,7 +43,6 @@ set(DPCPP_COMPILE_ONLY_FLAGS "") if(NOT "${DPCPP_SYCL_TARGET}" STREQUAL "") list(APPEND DPCPP_FLAGS "-fsycl-targets=${DPCPP_SYCL_TARGET};") - list(APPEND DPCPP_FLAGS "-Xspirv-translator;--spirv-ext=+SPV_INTEL_split_barrier;-DSPV_INTEL_split_barrier;") endif() if(NOT "${DPCPP_USER_FLAGS}" STREQUAL "") @@ -58,6 +57,10 @@ if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "") endif() endif() +if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc") + list(APPEND DPCPP_FLAGS "-Xspirv-translator;--spirv-ext=+SPV_INTEL_split_barrier;-DSPV_INTEL_split_barrier;") +endif() + if(UNIX) set_target_properties(DPCPP::DPCPP PROPERTIES INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS};${DPCPP_COMPILE_ONLY_FLAGS}" From 2ab1cd81c9be5c8de4ed9152edc2f1609bd69918 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 4 Sep 2024 12:44:28 +0100 Subject: [PATCH 18/30] Test --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 64cf206671..b63296b2e4 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -401,6 +401,8 @@ endif() # Warnings-as-error exceptions and warning suppressions for Clang builds if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-command-line-argument ") + list(APPEND CUTLASS_CUDA_NVCC_FLAGS "-Wno-error=unused-command-line-argument") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=implicit-int-conversion ") list(APPEND CUTLASS_CUDA_NVCC_FLAGS "-Wno-error=implicit-int-conversion" ) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pass-failed ") From 750ee3a7d8694a3075a236581e61016aeb57b71b Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 4 Sep 2024 14:32:45 +0100 Subject: [PATCH 19/30] Code cleanup --- .../gemm/kernel/intel_pvc_gemm_streamk.hpp | 6 +- ...rsistent_tile_scheduler_params_streamk.hpp | 107 ++++++++---------- .../intel_pvc_tile_scheduler_streamk.hpp | 19 ++-- 3 files changed, 54 insertions(+), 78 deletions(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp index 3f9424b0cb..f699a8fd6b 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp @@ -221,12 +221,8 @@ class GemmUniversal< CUTLASS_DEVICE void operator()(Params const& params, char* smem_buf) { - using namespace cute; - using X = Underscore; - // Preconditions - static_assert(size<0>(TileShape{}) >= 128, - "Cooperative kernel requires Tile Size to be greater than or equal to 128 along the M-dimension."); + CUTE_STATIC_ASSERT(is_static::value); static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>."); static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. 
If batch mode is not needed, set L stride to Int<0>."); diff --git a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp index 1201347fd8..85fcfdbefb 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp @@ -51,18 +51,18 @@ namespace detail { // Parameters for Intel PVC persistent stream-K scheduler struct PersistentTileSchedulerIntelPVCStreamKParams { - // Strategies for computing reductions between CTAs computing portions of a given output tile + // Strategies for computing reductions between work-groups computing portions of a given output tile enum class ReductionMode { - // Participating CTAs perform reduction in a turnstile fashion in order of the K extent - // covered by each CTA. This requires a lock to be held exclusively be the CTA that is + // Participating work-groups perform reduction in a turnstile fashion in order of the K extent + // covered by each work-group. This requires a lock to be held exclusively be the work-group that is // currently accumulating. // // Turnstile accumulation ensures deterministic numeric behavior when using this mode. Deterministic, - // Participating CTAs perform reduction atomically to the same workspace (mostly) without locking. - // Locks are used only to wait for the first CTA to write its partial values (to initialize the - // workspace), and for all but the final CTA to have accumulated (so that the final CTA can load + // Participating work-groups perform reduction atomically to the same workspace (mostly) without locking. + // Locks are used only to wait for the first work-group to write its partial values (to initialize the + // workspace), and for all but the final work-group to have accumulated (so that the final work-group can load // the accumulated value and accumulate it into registers on top of which the epilogue will // be performed). // @@ -87,12 +87,8 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { FastDivmodU64 divmod_batch_{}; FastDivmodU64 divmod_blk_major_{}; - // We divide up the number of stream-K tiles amongst G groups of stream-K units. - // The stream-K units within a group collaborate to comptue over the `sk_tiles / G` - // tiles assigned to that group. Non-unit group sizes can help to preserve L2 locality of - // partial chunks computed by stream-K units -- units 0 in each group will compute identical K extents - // of tiles that would be assigned in the same wave according to the rasterization order of the - // data-parallel formulation of the problem. + // Divide up the number of stream-K tiles amongst G groups of stream-K units. + // Currently defaults to 1 since we don't create groups for PVC. FastDivmodU64 divmod_sk_groups_{}; // Number of stream-K units in each group @@ -108,11 +104,9 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // Number of stream-K or split-K work units that compute an extra k iteration. // This is done to handle residuals in dividing up the k iteration space. - // For stream-K, since the actual assignment of work to stream-K units will be done - // at the granularity of a cluster, we store only the number of big clusters. uint32_t big_units_ = 0; - // The number of groups of stream-K units that will process an extra stream-K tile cluster. + // The number of groups of stream-K units that will process an extra stream-K tile. 
uint32_t big_groups_ = 0; // Workspace for holding partial accumulators to be reduced across stream-K/split-K units @@ -131,7 +125,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // processes one more K chunk than a "normal" stream-K unit. FastDivmod divmod_k_tiles_per_sk_big_unit_{}; - // Strategy to use when reducing between collaborating CTAs + // Strategy to use when reducing between collaborating work-groups ReductionMode reduction_mode_ = ReductionMode::Deterministic; // Minimum number of k tiles that can be assigned to a stream-K unit @@ -156,7 +150,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { void* workspace ) { - dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, tile_shape); + dim3 problem_blocks = get_tiled_wg_shape_mnl(problem_shape, tile_shape); // Number of k tiles in each output tile uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k(); @@ -171,8 +165,8 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { ); } - // Version of initialize that takes in as input the number of CTAs in the M and N and L dimensions. - // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1, + // Version of initialize that takes in as input the number of work-groups in the M and N and L dimensions. + // This is useful for calculating the tiled shape when a mode of problem and/or work-group shape has rank > 1, // for which using CuTe algebra for calculating tile shapes is easiest. void initialize( @@ -232,18 +226,17 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { return; } - // Calculate the maximum number of blocks from clusters of shape cluster_shape that we - // can fit within sm_count SMs. + // Calculate the maximum number of blocks that we can fit within sm_count SMs. dim3 grid = get_grid_shape( problem_blocks, hw_info ); - uint64_t ctas_per_wave = grid.x * grid.y; + uint64_t wgs_per_wave = grid.x * grid.y; // The number of output tiles to be computed in stream-K and data-parallel fashion, respectively. uint32_t sk_tiles = get_num_sk_tiles( output_tiles, - ctas_per_wave, + wgs_per_wave, k_tiles_per_output_tile, decomposition_mode ); @@ -259,8 +252,8 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // is needed per data-parallel tile. uint64_t dp_units = dp_tiles; - uint64_t ctas_per_sk_wave = ctas_per_wave; - uint64_t sk_units = get_num_sk_units(ctas_per_sk_wave, sk_tiles, k_tiles_per_output_tile); + uint64_t wgs_per_sk_wave = wgs_per_wave; + uint64_t sk_units = get_num_sk_units(wgs_per_sk_wave, sk_tiles, k_tiles_per_output_tile); if (decomposition_mode == DecompositionMode::DataParallel || (decomposition_mode == DecompositionMode::Heuristic && sk_tiles == 0) || @@ -282,13 +275,9 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { auto sk_units_per_group = sk_units / groups; - // sk_tiles is guaranteed to be divisible by cluster_size because it is calculated as: - // sk_tiles = (waves <= 2) ? total_tiles : (sm_count + (total_tiles % sm_count)) - // Both total_tiles and sm_count are multiples of cluster size due to padding added - // prior to kernel launch. uint64_t sk_tiles_per_group = sk_tiles / groups; - // Groups that will process an extra stream-K tile cluster. These differ from "big_units," which + // Groups that will process an extra stream-K tile. These differ from "big_units," which // are stream-K units within a group that process an extra K chunk. 
uint64_t sk_big_groups = sk_tiles % groups; @@ -345,10 +334,10 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { cute::tuple get_work_idx_m_and_n( uint64_t blk_per_grid_dim, - FastDivmodU64 const& divmod_cluster_blk_major) { + FastDivmodU64 const& divmod_blk_major) { uint64_t m_idx, n_idx; - divmod_cluster_blk_major(m_idx, n_idx, blk_per_grid_dim); + divmod_blk_major(m_idx, n_idx, blk_per_grid_dim); auto i = static_cast(m_idx); auto j = static_cast(n_idx); @@ -362,15 +351,15 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { get_linear_idx_from_m_and_n( int32_t tile_m, int32_t tile_n, - FastDivmodU64 const& divmod_cluster_blk_major) { - return static_cast(tile_m * divmod_cluster_blk_major.divisor + tile_n); + FastDivmodU64 const& divmod_blk_major) { + return static_cast(tile_m * divmod_blk_major.divisor + tile_n); } - // Get the number of CTA tiles in this problem. This variant of the method should only be used when + // Get the number of work-group tiles in this problem. This variant of the method should only be used when // problem_shape and tile_shape contain modes of only rank 1. CUTLASS_HOST_DEVICE static dim3 - get_tiled_cta_shape_mnl(BatchedGemmCoord problem_shape, GemmCoord cta_shape) { + get_tiled_wg_shape_mnl(BatchedGemmCoord problem_shape, GemmCoord cta_shape) { auto cta_m = (problem_shape.m() + cta_shape.m() - 1) / cta_shape.m(); auto cta_n = (problem_shape.n() + cta_shape.n() - 1) / cta_shape.n(); @@ -399,16 +388,16 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { } // Returns the number of stream-K tiles that will be computed amongst `output_tiles` total - // output tiles on a device with `ctas_per_wave` CTAs in each wave. + // output tiles on a device with `wgs_per_wave` work-groups in each wave. static uint32_t get_num_sk_tiles( uint64_t output_tiles, - uint64_t ctas_per_wave, + uint64_t wgs_per_wave, uint32_t k_tiles_per_output_tile, DecompositionMode decomposition_mode ) { - uint32_t full_waves = static_cast(output_tiles / ctas_per_wave); - uint32_t total_waves = static_cast((output_tiles + ctas_per_wave - 1) / ctas_per_wave); + uint32_t full_waves = static_cast(output_tiles / wgs_per_wave); + uint32_t total_waves = static_cast((output_tiles + wgs_per_wave - 1) / wgs_per_wave); if (decomposition_mode == DecompositionMode::DataParallel || decomposition_mode == DecompositionMode::SplitK) { @@ -420,7 +409,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // that full_waves == total_waves - 1 in this case, the number of data-parallel // waves is simply full_waves-1 (unless full_waves == 0). uint32_t dp_waves = full_waves > 1 ? full_waves - 1 : 0; - uint64_t dp_tiles = dp_waves * ctas_per_wave; + uint64_t dp_tiles = dp_waves * wgs_per_wave; uint64_t sk_tiles = output_tiles - dp_tiles; if (decomposition_mode == DecompositionMode::Heuristic) { @@ -436,8 +425,8 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // Rudimentary heuristic: prefer data-parallel decomposition if we have more than // one wave and the tail wave is more than half full. This is subject to change. 
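      // [Editorial worked example, not part of the patch] With made-up numbers and
      // wgs_per_wave = 32: output_tiles = 100 gives full_waves = 3 and a tail of
      // 100 - 96 = 4 tiles; 2 * 4 < 32, so dp_waves = 2 (64 data-parallel tiles) and
      // the remaining 36 tiles are covered by stream-K. For output_tiles = 116 the
      // tail is 20, 2 * 20 >= 32, and the check below returns 0 stream-K tiles,
      // i.e. a purely data-parallel decomposition.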
- uint64_t tail_tiles = output_tiles - (full_waves * ctas_per_wave); - if (2 * tail_tiles >= ctas_per_wave) { + uint64_t tail_tiles = output_tiles - (full_waves * wgs_per_wave); + if (2 * tail_tiles >= wgs_per_wave) { return 0; } } @@ -447,7 +436,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { CUTLASS_HOST_DEVICE static uint64_t - get_num_sk_units(uint64_t ctas_per_sk_wave, uint32_t sk_tiles, uint32_t k_tiles_per_output_tile) { + get_num_sk_units(uint64_t wgs_per_sk_wave, uint32_t sk_tiles, uint32_t k_tiles_per_output_tile) { // If there are stream-K tiles to compute and a sufficiently large number of k iterations // across them, they will be covered by a single wave of persistent threadblocks. Thus, there // will be as many work units as there are threadblocks in a single wave. @@ -457,16 +446,16 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // set of blocks. // Calculate the number of stream-K units that would be needed if each stream-K unit - // computed the minimum allowable k iterations. Truncate this to be in units of clusters. + // computed the minimum allowable k iterations. // Number of k iterations computed by the stream-K units as a whole uint64_t k_tiles_sk_total = k_tiles_per_output_tile * sk_tiles; // Calculate the number of stream-K units that would be needed if each stream-K unit - // computed the minimum allowable k iterations. Truncate this to be in units of clusters. + // computed the minimum allowable k iterations. uint64_t min_sized_sk_units = (k_tiles_sk_total / min_iters_per_sk_unit_); - uint64_t sk_units = platform::min(ctas_per_sk_wave, min_sized_sk_units); + uint64_t sk_units = platform::min(wgs_per_sk_wave, min_sized_sk_units); return sk_units; } @@ -528,23 +517,19 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { problem_blocks, new_hw_info ); - uint64_t ctas_per_wave = grid.x * grid.y; + uint64_t wgs_per_wave = grid.x * grid.y; uint32_t sk_tiles = get_num_sk_tiles( output_tiles, - ctas_per_wave, + wgs_per_wave, static_cast(k_tiles_per_output_tile), decomposition_mode ); - uint64_t ctas_per_sk_wave = ctas_per_wave; - uint64_t sk_units = get_num_sk_units(ctas_per_sk_wave, sk_tiles, k_tiles_per_output_tile); + uint64_t wgs_per_sk_wave = wgs_per_wave; + uint64_t sk_units = get_num_sk_units(wgs_per_sk_wave, sk_tiles, k_tiles_per_output_tile); uint64_t dp_tiles = output_tiles - sk_tiles; uint64_t reduction_tiles = sk_tiles; - // Though separate reduction requires a larger reduction workspace, only one barrier - // is needed per output tile. Each peer will increment the barrier by one once the peer has - // written its accumulator to scratch space. The separate reduction unit will only begin - // performing the reduction when the barrier has reached the number of peers for the output tile. 
barrier_workspace_size = get_barrier_workspace_size(sk_tiles, barrier_bits); reduction_workspace_size = get_reduction_workspace_size(reduction_tiles, tile_shape, accumulator_bits); } @@ -562,7 +547,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { uint32_t barrier_bits, uint32_t element_accumulator_bits) { - dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, tile_shape); + dim3 problem_blocks = get_tiled_wg_shape_mnl(problem_shape, tile_shape); uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k(); return get_workspace_size( @@ -577,8 +562,8 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { ); } - // Version of get_workspace_size that takes in as input the number of CTAs in the M and N dimensions. - // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1, + // Version of get_workspace_size that takes in as input the number of work-groups in the M and N dimensions. + // This is useful for calculating the tiled shape when a mode of problem and/or work-group shape has rank > 1, // for which using CuTe algebra for calculating tile shapes is easiest. static size_t get_workspace_size( @@ -623,7 +608,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { uint32_t barrier_bits, uint32_t element_accumulator_bits) { - dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, tile_shape); + dim3 problem_blocks = get_tiled_wg_shape_mnl(problem_shape, tile_shape); uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k(); return initialize_workspace( @@ -639,8 +624,8 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { ); } - // Version of initialize_workspace that takes in as input the number of CTAs in the M and N dimensions. - // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1, + // Version of initialize_workspace that takes in as input the number of work-groups in the M and N dimensions. + // This is useful for calculating the tiled shape when a mode of problem and/or work-group shape has rank > 1, // for which using CuTe algebra for calculating tile shapes is easiest. 
static cutlass::Status initialize_workspace( diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index 554724c39b..69ad69a374 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -155,7 +155,7 @@ class PersistentTileSchedulerIntelPVCStreamK { static_assert(cute::is_static::value); auto problem_shape_mnkl = cute::append<4>(problem_shape, cute::Int<1>{}); - dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + dim3 problem_blocks = get_tiled_wg_shape_mnl(problem_shape_mnkl, tile_shape); uint32_t k_tile_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(problem_shape_mnkl), cute::shape<2>(TileShape{}))); Params params; @@ -250,8 +250,8 @@ class PersistentTileSchedulerIntelPVCStreamK { template CUTLASS_HOST_DEVICE static dim3 - get_tiled_cta_shape_mnl(ProblemShape problem_shape_mnkl, TileShape cta_shape) { - return Params::get_tiled_cta_shape_mnl(to_gemm_coord(problem_shape_mnkl), to_gemm_coord(cta_shape)); + get_tiled_wg_shape_mnl(ProblemShape problem_shape_mnkl, TileShape cta_shape) { + return Params::get_tiled_wg_shape_mnl(to_gemm_coord(problem_shape_mnkl), to_gemm_coord(cta_shape)); } // Given the cluster shape, computes the physical grid we should launch. @@ -264,7 +264,7 @@ class PersistentTileSchedulerIntelPVCStreamK { KernelHardwareInfo hw_info) { auto problem_shape_mnkl = cute::append<4>(problem_shape, cute::Int<1>{}); - dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + dim3 problem_blocks = get_tiled_wg_shape_mnl(problem_shape_mnkl, tile_shape); return Params::get_grid_shape( problem_blocks, @@ -428,7 +428,7 @@ template TileShape tile_shape; - dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + dim3 problem_blocks = get_tiled_wg_shape_mnl(problem_shape_mnkl, tile_shape); uint32_t k_tile_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(problem_shape_mnkl), cute::shape<2>(TileShape{}))); return Params::get_workspace_size( @@ -455,7 +455,7 @@ template TileShape tile_shape; - dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + dim3 problem_blocks = get_tiled_wg_shape_mnl(problem_shape_mnkl, tile_shape); uint32_t k_tile_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(problem_shape_mnkl), cute::shape<2>(TileShape{}))); return Params::initialize_workspace( @@ -674,7 +674,7 @@ CUTLASS_DEVICE params.divmod_batch_(work_idx_l, remainder, output_tile_id); - uint64_t cta_per_grid_dim = remainder; //params.divmod_cluster_shape_minor_.divide(remainder); + uint64_t cta_per_grid_dim = remainder; auto [work_idx_m, work_idx_n] = Params::get_work_idx_m_and_n( cta_per_grid_dim, @@ -685,11 +685,6 @@ CUTLASS_DEVICE work_tile_info.M_idx = work_idx_m; work_tile_info.N_idx = work_idx_n; work_tile_info.L_idx = work_idx_l; - - // if(ThreadIdxX() == 0) - // printf("BlockID: %lu | k_tile_count: %d | M_idx: %lu | N_idx: %lu | K_idx: %lu | L_idx: %lu | ctas_per_grid_dim: %lu | output_tile_id: %lu\n", - // BlockIdxX(), work_tile_info.k_tile_count, work_tile_info.M_idx, work_tile_info.N_idx, work_tile_info.K_idx, - // work_tile_info.L_idx, remainder, output_tile_id); } }; From 8924208b6fbf38e9bdc6cfff8fd87c9632514bc7 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 4 Sep 2024 14:50:33 +0100 Subject: [PATCH 20/30] Add separate example for StreamK --- 
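At a glance, the new example differs from the plain PVC GEMM in only two places: the kernel type names the stream-K scheduler, and the arguments gain a scheduler field that selects the split count and decomposition mode. A condensed sketch follows, with the collective types and a SchedulerParams alias abbreviated for brevity; the full definitions appear in the listing below.

// Kernel: same collectives as pvc_gemm.cpp, plus the stream-K scheduler tag.
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    Shape<int, int, int, int>,
    CollectiveMainloop,            // defined as in the full example below
    CollectiveEpilogue,            // defined as in the full example below
    cutlass::gemm::StreamKScheduler>;

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

// Arguments: the trailing scheduler member picks the decomposition at runtime.
using SchedulerParams =
    cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams;

typename Gemm::GemmKernel::Arguments arguments{
    cutlass::gemm::GemmUniversalMode::kGemm,
    problem_size,
    {block_A.get(), stride_A, block_B.get(), stride_B},
    {{alpha, beta}, block_C.get(), stride_C, block_D.get(), stride_D},
    hw_info,
    {splits, SchedulerParams::DecompositionMode::StreamK}};   // or ::SplitK

Because stream-K needs barrier and reduction scratch, the example also sizes a workspace with Gemm::get_workspace_size(arguments) and passes the allocation to gemm_op.initialize(arguments, workspace.get()) before launching.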
examples/sycl/pvc/CMakeLists.txt | 5 + examples/sycl/pvc/pvc_gemm.cpp | 10 +- examples/sycl/pvc/pvc_gemm_streamk.cpp | 383 +++++++++++++++++++++++++ 3 files changed, 390 insertions(+), 8 deletions(-) create mode 100644 examples/sycl/pvc/pvc_gemm_streamk.cpp diff --git a/examples/sycl/pvc/CMakeLists.txt b/examples/sycl/pvc/CMakeLists.txt index 322896e20e..f9c5fca18c 100644 --- a/examples/sycl/pvc/CMakeLists.txt +++ b/examples/sycl/pvc/CMakeLists.txt @@ -41,3 +41,8 @@ cutlass_example_add_executable( pvc_collective_builder pvc_collective_builder.cpp ) + +cutlass_example_add_executable( + pvc_gemm_streamk + pvc_gemm_streamk.cpp +) diff --git a/examples/sycl/pvc/pvc_gemm.cpp b/examples/sycl/pvc/pvc_gemm.cpp index 18e2b48c10..5316e8f2cc 100644 --- a/examples/sycl/pvc/pvc_gemm.cpp +++ b/examples/sycl/pvc/pvc_gemm.cpp @@ -47,7 +47,6 @@ #include "cutlass/util/reference/device/tensor_compare.h" #include "common.h" -#include "cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp" using namespace cute; /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -220,9 +219,7 @@ struct ExampleRunner { problem_size, {block_A.get(), stride_A, block_B.get(), stride_B}, {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, - hw_info, - {16, - cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::SplitK} + hw_info }; Gemm gemm_op; @@ -247,8 +244,6 @@ struct ExampleRunner { GPU_Clock timer; timer.start(); for (int i = 0; i < options.iterations; ++i) { - if(workspace_size > 0) - Gemm::GemmKernel::initialize_workspace(arguments, workspace.get()); gemm_op.run(); } syclcompat::wait(); @@ -359,8 +354,7 @@ int main(int argc, const char** argv) using GemmKernel = cutlass::gemm::kernel::GemmUniversal< Shape, CollectiveMainloop, - CollectiveEpilogue, - cutlass::gemm::StreamKScheduler + CollectiveEpilogue >; using Gemm = cutlass::gemm::device::GemmUniversalAdapter; diff --git a/examples/sycl/pvc/pvc_gemm_streamk.cpp b/examples/sycl/pvc/pvc_gemm_streamk.cpp new file mode 100644 index 0000000000..a28c6b18d7 --- /dev/null +++ b/examples/sycl/pvc/pvc_gemm_streamk.cpp @@ -0,0 +1,383 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/collective/intel_pvc_epilogue.hpp" +#include "cutlass/epilogue/fusion/intel_pvc_callbacks.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include +#include + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "common.h" + +#include "cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp" +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + bool splitk; + + int m, n, k, l, iterations, splits; + float alpha, beta; + + Options(): + help(false), + error(false), + splitk(false), + m(5120), n(4096), k(4096), l(1), iterations(20), splits(1), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + if (cmd.check_cmd_line_flag("splitk")) { + splitk = true; + } + + cmd.get_cmd_line_argument("m", m, 5120); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("splits", splits, 16); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + cmd.get_cmd_line_argument("splits", splits, 1); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "PVC GEMM Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --splitk If specified, uses SplitK decomposition\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --splits= Sets the splitting factor for GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class Gemm +> +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + uint64_t seed = 0; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + syclcompat::wait(); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareEqual( + block_ref_D.get(), block_D.get(), block_D.size()); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, 
cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + } + + void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info, + {options.splits, + options.splitk ? cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::SplitK : + cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::StreamK} + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + + syclcompat::wait(); + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + Gemm::GemmKernel::initialize_workspace(arguments, workspace.get()); + gemm_op.run(); + } + syclcompat::wait(); + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + + return; + } + +}; + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + + // The code section below describes datatype for input, output matrices and computation between + // elements in input matrices. 
+ using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A + using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + using GmemTiledCopyA = XE_2D_U16x8x16x4x2_LD_N; + using GmemTiledCopyB = XE_2D_U16x16x16x2x2_V; + + // Workgroup-level tile + using TileShape = Shape<_256, _256, _32>; + + using TiledMma = TiledMMA, + Layout>, + Tile<_32,_64,_32>>; // Subgroup level-tile + + constexpr int PipelineStages = 3; + using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelPVC; + using EpilogueDispatchPolicy = cutlass::epilogue::IntelPVCEpilogue; + + using EpilogueOp = cutlass::epilogue::fusion::LinearCombination; + + using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks; + using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue< + EpilogueDispatchPolicy, + TileShape, + ElementAccumulator, + cutlass::gemm::TagToStrideC_t, + ElementOutput, + cutlass::gemm::TagToStrideC_t, + FusionCallBacks, + XE_2D_U32x8x16x1x1_LD_N, + void, void, + XE_2D_U32x8x16x1x1_ST_N, + void, void>; + +// Mainloop + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + GEMMDispatchPolicy, + TileShape, + ElementInputA, + cutlass::gemm::TagToStrideA_t, + ElementInputB, + cutlass::gemm::TagToStrideB_t, + TiledMma, + GmemTiledCopyA, void, void, cute::identity, // A + GmemTiledCopyB, void, void, cute::identity // B + >; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue, + cutlass::gemm::StreamKScheduler + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + runner.run(options, hw_info); + + return 0; +} From e8b2d2496586c29bb69bebd3dac7b1a3d2382f5f Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Fri, 6 Sep 2024 16:39:20 +0100 Subject: [PATCH 21/30] Address feedback for split barrier --- cmake/FindDPCPP.cmake | 2 +- include/cutlass/arch/barrier.h | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake index ba60ed2f8b..3574ea1604 100644 --- a/cmake/FindDPCPP.cmake +++ b/cmake/FindDPCPP.cmake @@ -58,7 +58,7 @@ if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "") endif() if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc") - list(APPEND DPCPP_FLAGS "-Xspirv-translator;--spirv-ext=+SPV_INTEL_split_barrier;-DSPV_INTEL_split_barrier;") + list(APPEND DPCPP_FLAGS "-Xspirv-translator;--spirv-ext=+SPV_INTEL_split_barrier") endif() if(UNIX) diff --git a/include/cutlass/arch/barrier.h b/include/cutlass/arch/barrier.h index 756f21971d..db2777e24a 100644 --- a/include/cutlass/arch/barrier.h +++ b/include/cutlass/arch/barrier.h @@ -38,8 +38,13 @@ #include #if defined SYCL_INTEL_TARGET -SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierWaitINTEL(int, int, int); -SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierArriveINTEL(int, int, int); +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierWaitINTEL(int execution_scope, int memory_scope, int memory_semantics); +SYCL_EXTERNAL __attribute__((convergent)) void 
__spirv_ControlBarrierArriveINTEL(int execution_scope, int memory_scope, int memory_semantics); + +#define EXECUTION_SCOPE_WORK_GROUP 2 +#define MEMORY_SCOPE_WORK_GROUP 2 +#define MEMORY_SEMANTICS_RELAXED 0 + #elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && (__CUDACC_VER_MAJOR__ >= 12) #define CUDA_BARRIER_ENABLED 1 #else @@ -156,8 +161,8 @@ class NamedBarrier { CUTLASS_DEVICE static void arrive_and_wait_internal(uint32_t num_threads, uint32_t barrier_id) { #if defined SYCL_INTEL_TARGET - __spirv_ControlBarrierArriveINTEL(2, 2, 0x0); - __spirv_ControlBarrierWaitINTEL(2, 2, 0x0); + __spirv_ControlBarrierArriveINTEL(EXECUTION_SCOPE_WORK_GROUP, MEMORY_SCOPE_WORK_GROUP, MEMORY_SEMANTICS_RELAXED); + __spirv_ControlBarrierWaitINTEL(EXECUTION_SCOPE_WORK_GROUP, MEMORY_SCOPE_WORK_GROUP, MEMORY_SEMANTICS_RELAXED); #elif defined CUDA_BARRIER_ENABLED asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads)); #elif defined(__CUDA_ARCH__) @@ -168,7 +173,7 @@ class NamedBarrier { CUTLASS_DEVICE static void arrive_internal(uint32_t num_threads, uint32_t barrier_id) { #if defined SYCL_INTEL_TARGET - __spirv_ControlBarrierArriveINTEL(2, 2, 0x0); + __spirv_ControlBarrierArriveINTEL(EXECUTION_SCOPE_WORK_GROUP, MEMORY_SCOPE_WORK_GROUP, MEMORY_SEMANTICS_RELAXED); #elif CUDA_BARRIER_ENABLED asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads)); #elif defined(__CUDA_ARCH__) From c3875e7330edae164a03bb7ac40bf1a51938e885 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 9 Sep 2024 10:04:00 +0100 Subject: [PATCH 22/30] Fix address space for atomicAdd * Instantiate new accumulator registers per iteration --- include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp | 4 +--- include/cutlass/gpu_generics.h | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp index f699a8fd6b..549350d893 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp @@ -259,8 +259,6 @@ class GemmUniversal< CollectiveMainloop collective_mma; CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; - Tensor accumulators = make_tensor(Shape, Int, Int>{}); - const int m_offset = sub_group_id / CollectiveMainloop::sg_per_wg_n * get<0>(subgroup_shape); const int n_offset = sub_group_id % CollectiveMainloop::sg_per_wg_n * get<1>(subgroup_shape); @@ -290,7 +288,7 @@ class GemmUniversal< auto n_max_coord = N - get<1>(subgroup_shape) * n_coord; // N - SUB_N * n_coord auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue); - clear(accumulators); + Tensor accumulators = make_tensor(Shape, Int, Int>{}); // Perform the collective scoped MMA collective_mma( diff --git a/include/cutlass/gpu_generics.h b/include/cutlass/gpu_generics.h index a406791437..44b5a92acb 100644 --- a/include/cutlass/gpu_generics.h +++ b/include/cutlass/gpu_generics.h @@ -317,7 +317,7 @@ using dim3 = syclcompat::dim3; template CUTLASS_DEVICE T atomicAdd(T *address, T val) { #if defined(__SYCL_DEVICE_ONLY__) - return syclcompat::atomic_fetch_add(address, val); + return syclcompat::atomic_fetch_add(address, val); #endif return 0; } From 7cfbf62e4f791698d3be75742dec78c721df9f44 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 9 Sep 2024 12:03:06 +0100 Subject: [PATCH 23/30] Renamed the pipeline file --- include/cutlass/gemm/kernel/gemm_universal.hpp | 2 +- ...ntel_pvc_gemm_streamk.hpp => 
intel_pvc_gemm_cooperative.hpp} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename include/cutlass/gemm/kernel/{intel_pvc_gemm_streamk.hpp => intel_pvc_gemm_cooperative.hpp} (100%) diff --git a/include/cutlass/gemm/kernel/gemm_universal.hpp b/include/cutlass/gemm/kernel/gemm_universal.hpp index 3b601005d4..0d063db130 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.hpp +++ b/include/cutlass/gemm/kernel/gemm_universal.hpp @@ -65,6 +65,6 @@ struct IsCutlass3ArrayKernel Date: Mon, 16 Sep 2024 16:19:57 +0100 Subject: [PATCH 24/30] Renamed files to xe_* * Removed l2 workspace alignment --- examples/sycl/pvc/pvc_gemm_streamk.cpp | 7 +++---- .../cutlass/gemm/kernel/gemm_universal.hpp | 2 +- .../cutlass/gemm/kernel/tile_scheduler.hpp | 4 ++-- ...ooperative.hpp => xe_gemm_cooperative.hpp} | 0 ...sistent_tile_scheduler_params_streamk.hpp} | 19 +++++-------------- ...eamk.hpp => xe_tile_scheduler_streamk.hpp} | 12 ++++++------ 6 files changed, 17 insertions(+), 27 deletions(-) rename include/cutlass/gemm/kernel/{intel_pvc_gemm_cooperative.hpp => xe_gemm_cooperative.hpp} (100%) rename include/cutlass/gemm/kernel/{intel_pvc_persistent_tile_scheduler_params_streamk.hpp => xe_persistent_tile_scheduler_params_streamk.hpp} (97%) rename include/cutlass/gemm/kernel/{intel_pvc_tile_scheduler_streamk.hpp => xe_tile_scheduler_streamk.hpp} (98%) diff --git a/examples/sycl/pvc/pvc_gemm_streamk.cpp b/examples/sycl/pvc/pvc_gemm_streamk.cpp index a28c6b18d7..2c09eb6731 100644 --- a/examples/sycl/pvc/pvc_gemm_streamk.cpp +++ b/examples/sycl/pvc/pvc_gemm_streamk.cpp @@ -47,7 +47,7 @@ #include "cutlass/util/reference/device/tensor_compare.h" #include "common.h" -#include "cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp" +#include "cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp" using namespace cute; /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -87,7 +87,6 @@ struct Options { cmd.get_cmd_line_argument("n", n, 4096); cmd.get_cmd_line_argument("k", k, 4096); cmd.get_cmd_line_argument("l", l, 1); - cmd.get_cmd_line_argument("splits", splits, 16); cmd.get_cmd_line_argument("alpha", alpha, 1.f); cmd.get_cmd_line_argument("beta", beta, 0.f); cmd.get_cmd_line_argument("iterations", iterations, 100); @@ -232,8 +231,8 @@ struct ExampleRunner { {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, hw_info, {options.splits, - options.splitk ? cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::SplitK : - cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::StreamK} + options.splitk ? 
cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::SplitK : + cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::StreamK} }; Gemm gemm_op; diff --git a/include/cutlass/gemm/kernel/gemm_universal.hpp b/include/cutlass/gemm/kernel/gemm_universal.hpp index 0d063db130..eced02115a 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.hpp +++ b/include/cutlass/gemm/kernel/gemm_universal.hpp @@ -65,6 +65,6 @@ struct IsCutlass3ArrayKernel { - using Scheduler = PersistentTileSchedulerIntelPVCStreamK; + using Scheduler = PersistentTileSchedulerXeStreamK; }; #endif diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm_cooperative.hpp b/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp similarity index 100% rename from include/cutlass/gemm/kernel/intel_pvc_gemm_cooperative.hpp rename to include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp diff --git a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp similarity index 97% rename from include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp rename to include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp index 85fcfdbefb..443e2cf100 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp +++ b/include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp @@ -48,8 +48,8 @@ namespace kernel { namespace detail { //////////////////////////////////////////////////////////////////////////////// -// Parameters for Intel PVC persistent stream-K scheduler -struct PersistentTileSchedulerIntelPVCStreamKParams { +// Parameters for Xe persistent stream-K scheduler +struct PersistentTileSchedulerXeStreamKParams { // Strategies for computing reductions between work-groups computing portions of a given output tile enum class ReductionMode { @@ -88,7 +88,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { FastDivmodU64 divmod_blk_major_{}; // Divide up the number of stream-K tiles amongst G groups of stream-K units. - // Currently defaults to 1 since we don't create groups for PVC. + // Currently defaults to 1 since we don't create groups for Xe. 
FastDivmodU64 divmod_sk_groups_{}; // Number of stream-K units in each group @@ -464,7 +464,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { static size_t get_barrier_workspace_size(uint64_t num_tiles, uint32_t barrier_bits) { size_t workspace_bits = num_tiles * static_cast(barrier_bits); - return round_up_to_l2_alignment(bits_to_bytes(workspace_bits)); + return bits_to_bytes(workspace_bits); } // Calculates the size of the workspace needed for holding partial outputs from splits @@ -473,7 +473,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { get_reduction_workspace_size(uint64_t num_tiles, GemmCoord tile_shape, uint32_t accumulator_bits, uint32_t num_accumulator_mtxs = 1) { size_t output_tile_size = tile_shape.m() * tile_shape.n(); size_t workspace_bits = accumulator_bits * output_tile_size * num_tiles * num_accumulator_mtxs; - return round_up_to_l2_alignment(bits_to_bytes(workspace_bits)); + return bits_to_bytes(workspace_bits); } static void @@ -695,15 +695,6 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { sk_units_ = 0; divmod_sk_units_per_group_ = FastDivmodU64(blocks_m * blocks_n * blocks_l); } - - private: - // Round up number of bytes to the nearest multiple of L2 cache line alignment - CUTLASS_HOST_DEVICE - static size_t - round_up_to_l2_alignment(size_t bytes) { - constexpr size_t L2CacheLineSizeBytes = 128u; - return (bytes + L2CacheLineSizeBytes - 1) / L2CacheLineSizeBytes * L2CacheLineSizeBytes; - } }; //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp similarity index 98% rename from include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp rename to include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp index 69ad69a374..1caf2cba0a 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp @@ -38,7 +38,7 @@ #include "cutlass/kernel_hardware_info.hpp" #include "cute/layout.hpp" #include "cute/tensor.hpp" -#include "cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp" +#include "cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp" namespace cutlass::gemm::kernel::detail { @@ -46,7 +46,7 @@ namespace cutlass::gemm::kernel::detail { template < class TileShape > -class PersistentTileSchedulerIntelPVCStreamK { +class PersistentTileSchedulerXeStreamK { // // Data members // @@ -59,7 +59,7 @@ class PersistentTileSchedulerIntelPVCStreamK { // Use a dummy barrier manager to simply get the type used to store the barrier using BarrierType = typename NamedBarrierManager<1>::T; - using Params = PersistentTileSchedulerIntelPVCStreamKParams; + using Params = PersistentTileSchedulerXeStreamKParams; using ReductionMode = Params::ReductionMode; using DecompositionMode = Params::DecompositionMode; @@ -180,10 +180,10 @@ class PersistentTileSchedulerIntelPVCStreamK { } CUTLASS_HOST_DEVICE - PersistentTileSchedulerIntelPVCStreamK() { }; + PersistentTileSchedulerXeStreamK() { }; CUTLASS_HOST_DEVICE - PersistentTileSchedulerIntelPVCStreamK(Params const& params_) : scheduler_params(params_) { + PersistentTileSchedulerXeStreamK(Params const& params_) : scheduler_params(params_) { current_work_linear_idx_ = uint64_t(BlockIdxX()); } @@ -324,7 +324,7 @@ template int barrier_group_thread_idx = ThreadIdxX(); // Reductions use BlockStripedReduce with a width of 
BarrierManager::ThreadCount under the hood. - // Thus, the start of the reduction space is the same across all threads in a warp group. + // Thus, the start of the reduction space is the same across all threads in a work group. int reduction_offset = (cute::size<0>(TileShape{}) * cute::size<1>(TileShape{}) * reduction_tile_idx * num_accumulator_mtxs) + reduction_peer_offset; From e2a0d9b3524c220009980c1a81515cc217e1ea27 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 18 Sep 2024 14:58:12 +0100 Subject: [PATCH 25/30] Update the example to reduce caching effects --- examples/sycl/pvc/common.h | 4 ++ examples/sycl/pvc/pvc_gemm_streamk.cpp | 67 ++++++++++++++++++-------- 2 files changed, 52 insertions(+), 19 deletions(-) diff --git a/examples/sycl/pvc/common.h b/examples/sycl/pvc/common.h index cd11b1c7c9..915728da0d 100644 --- a/examples/sycl/pvc/common.h +++ b/examples/sycl/pvc/common.h @@ -58,3 +58,7 @@ bool initialize_block( block.get(), block.size(), seed, scope_max, scope_min, 0); return true; } + +size_t get_llc_size() { + return syclcompat::get_default_queue().get_device().get_info(); +} diff --git a/examples/sycl/pvc/pvc_gemm_streamk.cpp b/examples/sycl/pvc/pvc_gemm_streamk.cpp index 2c09eb6731..ed98dc86d0 100644 --- a/examples/sycl/pvc/pvc_gemm_streamk.cpp +++ b/examples/sycl/pvc/pvc_gemm_streamk.cpp @@ -52,6 +52,8 @@ using namespace cute; /////////////////////////////////////////////////////////////////////////////////////////////////// +#define CUTLASS_SYCL_PROFILING_ENABLED + // Command line options parsing struct Options { @@ -142,6 +144,8 @@ struct ExampleRunner { using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + int32_t count; + // // Data members // @@ -153,9 +157,9 @@ struct ExampleRunner { StrideD stride_D; uint64_t seed = 0; - cutlass::DeviceAllocation block_A; - cutlass::DeviceAllocation block_B; - cutlass::DeviceAllocation block_C; + std::vector> block_A; + std::vector> block_B; + std::vector> block_C; cutlass::DeviceAllocation block_D; cutlass::DeviceAllocation block_ref_D; @@ -166,9 +170,9 @@ struct ExampleRunner { bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { auto [M, N, K, L] = problem_size; - cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); - cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); - cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_A(block_A[0].get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B[0].get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C[0].get(), LayoutC::packed({M, N})); cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); cutlass::reference::device::GemmComplex( @@ -208,15 +212,27 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - block_A.reset(M * K * L); - block_B.reset(K * N * L); - block_C.reset(M * N * L); + std::size_t mem_occupied_ABC = (M * K * L * sizeof(ElementA)) + (K * N * L * sizeof(ElementB)) + + (M * N * L * sizeof(ElementC)); + count = std::ceil(static_cast(get_llc_size()) / static_cast(mem_occupied_ABC)); + + for(int i = 0; i < count; i++) { + block_A.emplace_back(); + block_B.emplace_back(); + block_C.emplace_back(); + } + + for (int i = 0; i < count; i++) { + block_A[i].reset(M * K * L); + block_B[i].reset(K * N * L); + block_C[i].reset(M * N * L); + initialize_block(block_A[i], seed + 
i); + initialize_block(block_B[i], seed + i); + initialize_block(block_C[i], seed + i); + } + block_D.reset(M * N * L); block_ref_D.reset(M * N * L); - - initialize_block(block_A, seed + 2023); - initialize_block(block_B, seed + 2022); - initialize_block(block_C, seed + 2021); } void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { @@ -227,8 +243,8 @@ struct ExampleRunner { typename Gemm::GemmKernel::Arguments arguments{ cutlass::gemm::GemmUniversalMode::kGemm, problem_size, - {block_A.get(), stride_A, block_B.get(), stride_B}, - {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + {block_A[0].get(), stride_A, block_B[0].get(), stride_B}, + {{options.alpha, options.beta}, block_C[0].get(), stride_C, block_D.get(), stride_D}, hw_info, {options.splits, options.splitk ? cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::SplitK : @@ -255,14 +271,27 @@ struct ExampleRunner { if (passed && options.iterations > 0) { GPU_Clock timer; - timer.start(); + float elapsed_time_seconds = 0.f; for (int i = 0; i < options.iterations; ++i) { - Gemm::GemmKernel::initialize_workspace(arguments, workspace.get()); + int32_t idx = std::max(int(0), (i % count) - 1); + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A[idx].get(), stride_A, block_B[idx].get(), stride_B}, + {{options.alpha, options.beta}, block_C[idx].get(), stride_C, block_D.get(), stride_D}, + hw_info, + {options.splits, + options.splitk ? cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::SplitK : + cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::StreamK} + }; + gemm_op.initialize(arguments, workspace.get()); + timer.start(); gemm_op.run(); + syclcompat::wait(); + elapsed_time_seconds += timer.seconds(); } - syclcompat::wait(); - float cute_time = timer.seconds() / options.iterations; + float cute_time = elapsed_time_seconds / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); From 6d1900082ef2632649528633e5adaabd0fa65920 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Fri, 20 Sep 2024 12:51:47 +0100 Subject: [PATCH 26/30] Refactor pipeline code --- .../gemm/kernel/xe_gemm_cooperative.hpp | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp b/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp index 549350d893..cd299ce6de 100644 --- a/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp +++ b/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp @@ -247,18 +247,6 @@ class GemmUniversal< constexpr auto workgroup_shape = WorkgroupTileShape{}; // (BLK_M,BLK_N,BLK_K) constexpr auto subgroup_shape = SubgroupTileShape{}; // (SUB_M,SUB_N,SUB_K) - constexpr int version = - is_same_v - ? 
1 - : 2; - - auto k_residue = K - get<2>(subgroup_shape) * (K / get<2>(subgroup_shape)); // K - SUB_K * k_coord_max - - TiledMma tiled_mma; - CollectiveMainloop collective_mma; - CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; - const int m_offset = sub_group_id / CollectiveMainloop::sg_per_wg_n * get<0>(subgroup_shape); const int n_offset = sub_group_id % CollectiveMainloop::sg_per_wg_n * get<1>(subgroup_shape); @@ -268,8 +256,8 @@ class GemmUniversal< const int l_coord = work_tile_info.L_idx; // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work. - auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, workgroup_shape); - auto work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info); + const int work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, workgroup_shape); + const int work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info); auto k_tile_iter = cute::make_coord_iterator(idx2crd(work_k_tile_start, make_shape(K)), make_shape(K)); const auto tile_coord = make_coord(m_coord, n_coord, _, l_coord); @@ -278,11 +266,19 @@ class GemmUniversal< make_shape(_1{}, K, L), make_stride(Int{} * get<0>(MmaAtomShape()),_1{})); + constexpr int version = + is_same_v + ? 1 + : 2; + Tensor tBi = params.mainloop.gmem_tiled_copy_b.get_pvc_tensor( make_coord(n_coord, 0, 0), make_shape(Int{}, K, L), make_stride(Int(MmaAtomShape())>{}, _1{})); + auto k_residue = K - get<2>(subgroup_shape) * (K / get<2>(subgroup_shape)); // K - SUB_K * k_coord_max + // Compute tile residues for predication auto m_max_coord = M - get<0>(subgroup_shape) * m_coord; // M - SUB_M * m_coord auto n_max_coord = N - get<1>(subgroup_shape) * n_coord; // N - SUB_N * n_coord @@ -290,6 +286,8 @@ class GemmUniversal< Tensor accumulators = make_tensor(Shape, Int, Int>{}); + CollectiveMainloop collective_mma; + // Perform the collective scoped MMA collective_mma( accumulators, @@ -308,6 +306,9 @@ class GemmUniversal< params.scheduler, work_tile_info, accumulators); if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) { + CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; + TiledMma tiled_mma; + epilogue( problem_shape_MNKL, subgroup_shape, From c06a28e52eaa04ed04eb113c0fe2439f9fad5c9a Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Thu, 26 Sep 2024 16:05:37 +0100 Subject: [PATCH 27/30] Add the option to invoke data parallel decomposition --- examples/sycl/pvc/pvc_gemm_streamk.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/examples/sycl/pvc/pvc_gemm_streamk.cpp b/examples/sycl/pvc/pvc_gemm_streamk.cpp index ed98dc86d0..94a15e7a71 100644 --- a/examples/sycl/pvc/pvc_gemm_streamk.cpp +++ b/examples/sycl/pvc/pvc_gemm_streamk.cpp @@ -60,6 +60,7 @@ struct Options { bool help; bool error; bool splitk; + bool dp; int m, n, k, l, iterations, splits; float alpha, beta; @@ -68,6 +69,7 @@ struct Options { help(false), error(false), splitk(false), + dp(false), m(5120), n(4096), k(4096), l(1), iterations(20), splits(1), alpha(1.f), beta(0.f) { } @@ -85,6 +87,10 @@ struct Options { splitk = true; } + if (cmd.check_cmd_line_flag("dp")) { + dp = true; + } + cmd.get_cmd_line_argument("m", m, 5120); cmd.get_cmd_line_argument("n", n, 4096); cmd.get_cmd_line_argument("k", k, 4096); @@ -101,7 +107,8 @@ struct Options { out << "PVC GEMM Example\n\n" << "Options:\n\n" << " --help If specified, displays 
this usage statement\n\n" - << " --splitk If specified, uses SplitK decomposition\n" + << " --dp If specified, uses Data Parallel decomposition\n" + << " --splitk If specified, uses SplitK decomposition\n" << " --m= Sets the M extent of the GEMM\n" << " --n= Sets the N extent of the GEMM\n" << " --k= Sets the K extent of the GEMM\n" @@ -247,6 +254,7 @@ struct ExampleRunner { {{options.alpha, options.beta}, block_C[0].get(), stride_C, block_D.get(), stride_D}, hw_info, {options.splits, + options.dp ? cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::DataParallel : options.splitk ? cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::SplitK : cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::StreamK} }; @@ -281,6 +289,7 @@ struct ExampleRunner { {{options.alpha, options.beta}, block_C[idx].get(), stride_C, block_D.get(), stride_D}, hw_info, {options.splits, + options.dp ? cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::DataParallel : options.splitk ? cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::SplitK : cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::StreamK} }; From 2997eca414910c1d003a9c3e9a4093e2f7410930 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 23 Oct 2024 19:25:13 +0100 Subject: [PATCH 28/30] Fixing bugs post merge --- examples/sycl/pvc/pvc_gemm_streamk.cpp | 18 ++--- include/cutlass/gemm/collective/xe_mma.hpp | 66 ++++++++++++------- include/cutlass/gemm/kernel/xe_gemm.hpp | 2 + .../gemm/kernel/xe_gemm_cooperative.hpp | 46 +++++-------- 4 files changed, 71 insertions(+), 61 deletions(-) diff --git a/examples/sycl/pvc/pvc_gemm_streamk.cpp b/examples/sycl/pvc/pvc_gemm_streamk.cpp index 94a15e7a71..6f142f98ff 100644 --- a/examples/sycl/pvc/pvc_gemm_streamk.cpp +++ b/examples/sycl/pvc/pvc_gemm_streamk.cpp @@ -30,8 +30,8 @@ **************************************************************************************************/ #include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "cutlass/epilogue/collective/intel_pvc_epilogue.hpp" -#include "cutlass/epilogue/fusion/intel_pvc_callbacks.hpp" +#include "cutlass/epilogue/collective/xe_epilogue.hpp" +#include "cutlass/epilogue/fusion/xe_callbacks.hpp" #include "cutlass/gemm/device/gemm_universal.h" #include "cutlass/gemm/device/gemm_universal_adapter.h" #include "cutlass/gemm/collective/collective_mma.hpp" @@ -358,15 +358,15 @@ int main(int argc, const char** argv) using LayoutC = cutlass::layout::RowMajor; using LayoutD = cutlass::layout::RowMajor; - using GmemTiledCopyA = XE_2D_U16x8x16x4x2_LD_N; - using GmemTiledCopyB = XE_2D_U16x16x16x2x2_V; + using GmemTiledCopyA = XE_2D_U16x8x16_LD_N; + using GmemTiledCopyB = XE_2D_U16x16x16_LD_V; // Workgroup-level tile - using TileShape = Shape<_256, _256, _32>; + using TileShape = Shape<_256, _128, _16>; using TiledMma = TiledMMA, - Layout>, - Tile<_32,_64,_32>>; // Subgroup level-tile + Layout>, + Tile<_64,_32,_16>>; // Subgroup level-tile constexpr int PipelineStages = 3; using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelPVC; @@ -385,9 +385,9 @@ int main(int argc, const char** argv) ElementOutput, cutlass::gemm::TagToStrideC_t, FusionCallBacks, - XE_2D_U32x8x16x1x1_LD_N, + XE_2D_U32x8x16_LD_N, void, void, - XE_2D_U32x8x16x1x1_ST_N, + XE_2D_U32x8x16_ST_N, void, void>; // Mainloop diff --git 
a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp index 61fc7867fc..1df7d6e3a0 100644 --- a/include/cutlass/gemm/collective/xe_mma.hpp +++ b/include/cutlass/gemm/collective/xe_mma.hpp @@ -176,7 +176,8 @@ struct CollectiveMma< class TensorB, class FrgTensorC, class KTileIterator, - class ResidueMNK + class ResidueMNK, + class BlkCoord > CUTLASS_DEVICE void operator() ( @@ -186,6 +187,8 @@ struct CollectiveMma< FrgTensorC const &src_accum, KTileIterator k_tile_iter, int k_tile_count, ResidueMNK residue_mnk, + BlkCoord const &blk_coord, + int const &K, int thread_idx, char *smem_buf, Params const& mainloop) @@ -230,38 +233,53 @@ struct CollectiveMma< // // Mainloop // - const int m_coord = BlockIdxY() * BLK_M + (get_sub_group_id() / ATOM_N) * SG_M; - const int n_coord = BlockIdxX() * BLK_N + (get_sub_group_id() % ATOM_N) * SG_N; - const int l_coord = BlockIdxZ(); + + auto [m_idx, n_idx, k_idx, l_idx] = blk_coord; + const int m_coord = m_idx * BLK_M + (get_sub_group_id() / ATOM_N) * SG_M; + const int n_coord = n_idx * BLK_N + (get_sub_group_id() % ATOM_N) * SG_N; + const int l_coord = l_idx; + Tensor iter_a = mainloop.gmem_tiled_copy_a.get_pvc_tensor( - make_coord(m_coord, 0, l_coord), make_shape(_, size<1>(tCrA_copy_view.shape()), size<2>(tCrA_copy_view.shape()), k_tile_count), - append<3>(typename XE_Copy_A::Shape_MN{}, BLK_K), seq<0,1,1>{}); + make_coord(m_coord, 0, l_coord), + make_shape(_, size<1>(tCrA_copy_view.shape()), + size<2>(tCrA_copy_view.shape()), k_tile_count), + append<3>(typename XE_Copy_A::Shape_MN{}, BLK_K), seq<0,1,1>{}); + Tensor iter_b = mainloop.gmem_tiled_copy_b.get_pvc_tensor( - make_coord(0, n_coord, l_coord), make_shape(_, size<2>(tCrB_copy_view.shape()), size<1>(tCrB_copy_view.shape()), k_tile_count), - append<3>(typename XE_Copy_B::Shape_MN{}, BLK_K), seq<0,1,0>{}); -#pragma unroll - for (int i = 0; i < DispatchPolicy::Stages; i++) { + make_coord(0, n_coord, l_coord), + make_shape(_, size<2>(tCrB_copy_view.shape()), + size<1>(tCrB_copy_view.shape()), k_tile_count), + append<3>(typename XE_Copy_B::Shape_MN{}, BLK_K), seq<0,1,0>{}); + + const int k_start_idx = crd2idx((*k_tile_iter), make_shape(K)); + int prefetch_k = k_start_idx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < DispatchPolicy::Stages; i++, prefetch_k++) { if constexpr(cute::detail::has_prefetch) { - prefetch(mainloop.gmem_tiled_copy_a, iter_a(_,_,_,i)); + prefetch(mainloop.gmem_tiled_copy_a, iter_a(_,_,_,prefetch_k)); } if constexpr(cute::detail::has_prefetch) { - prefetch(mainloop.gmem_tiled_copy_b, iter_b(_,_,_,i)); + prefetch(mainloop.gmem_tiled_copy_b, iter_b(_,_,_,prefetch_k)); } } -#pragma unroll - for (int k_tile = 0; k_tile < k_tile_count; ++k_tile) { + + CUTLASS_PRAGMA_UNROLL + for (int k_tile = 0, k = k_start_idx; k_tile < k_tile_count; ++k_tile, ++k, ++prefetch_k) { // Copy gmem to rmem for the first k_tile - copy(mainloop.gmem_tiled_copy_a, iter_a(_,_,_,k_tile), tCrA_copy_view); - copy(mainloop.gmem_tiled_copy_b, iter_b(_,_,_,k_tile), tCrB_copy_view); - if(k_tile + DispatchPolicy::Stages < k_tile_count) { - if constexpr(cute::detail::has_prefetch) { - prefetch(mainloop.gmem_tiled_copy_a, iter_a(_,_,_,k_tile + DispatchPolicy::Stages)); - } - if constexpr(cute::detail::has_prefetch) { - prefetch(mainloop.gmem_tiled_copy_b, iter_b(_,_,_,k_tile + DispatchPolicy::Stages)); + copy(mainloop.gmem_tiled_copy_a, iter_a(_,_,_,k), tCrA_copy_view); + copy(mainloop.gmem_tiled_copy_b, iter_b(_,_,_,k), tCrB_copy_view); + + if(prefetch_k < k_tile_count) { + if 
constexpr(cute::detail::has_prefetch) { + prefetch(mainloop.gmem_tiled_copy_a, iter_a(_,_,_,prefetch_k)); + } + if constexpr(cute::detail::has_prefetch) { + prefetch(mainloop.gmem_tiled_copy_b, iter_b(_,_,_,prefetch_k)); + } } - } - cute::gemm(tiled_mma, accum, tCrA, tCrB, src_accum); + + cute::gemm(tiled_mma, accum, tCrA, tCrB, src_accum); } } }; diff --git a/include/cutlass/gemm/kernel/xe_gemm.hpp b/include/cutlass/gemm/kernel/xe_gemm.hpp index 54aad46709..99f98d807c 100644 --- a/include/cutlass/gemm/kernel/xe_gemm.hpp +++ b/include/cutlass/gemm/kernel/xe_gemm.hpp @@ -248,6 +248,8 @@ class GemmUniversal< accumulators, k_tile_iter, k_tile_count, residue_mnk, + blk_coord_mnkl, + K, thread_idx, smem_buf, params.mainloop diff --git a/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp b/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp index cd299ce6de..574992c5be 100644 --- a/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp +++ b/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp @@ -243,59 +243,50 @@ class GemmUniversal< auto work_tile_info = scheduler.initial_work_tile_info(); int thread_idx = int(ThreadIdxX()); - int sub_group_id = thread_idx / SubgroupSize; constexpr auto workgroup_shape = WorkgroupTileShape{}; // (BLK_M,BLK_N,BLK_K) constexpr auto subgroup_shape = SubgroupTileShape{}; // (SUB_M,SUB_N,SUB_K) - const int m_offset = sub_group_id / CollectiveMainloop::sg_per_wg_n * get<0>(subgroup_shape); - const int n_offset = sub_group_id % CollectiveMainloop::sg_per_wg_n * get<1>(subgroup_shape); - while (work_tile_info.is_valid()) { - const int m_coord = work_tile_info.M_idx * get<0>(workgroup_shape) + m_offset; - const int n_coord = work_tile_info.N_idx * get<1>(workgroup_shape) + n_offset; + const int m_coord = work_tile_info.M_idx; + const int n_coord = work_tile_info.N_idx; const int l_coord = work_tile_info.L_idx; + const auto tile_coord = make_coord(m_coord, n_coord, _, l_coord); + + Tensor mA_mkl = make_tensor(make_gmem_ptr(static_cast(nullptr)), make_shape(M,K,L), StrideA{}); //(m,k,l) + Tensor mB_nkl = make_tensor(make_gmem_ptr(static_cast(nullptr)), make_shape(N,K,L), StrideB{}); //(n,k,l) + Tensor mA_mk = mA_mkl(_,_,l_coord); // (m,k) + Tensor mB_nk = mB_nkl(_,_,l_coord); // (n,k) + + auto gA = local_tile(mA_mk, workgroup_shape, take<0, 3>(tile_coord), Step<_1, X, _1>{}); + auto gB = local_tile(mB_nk, workgroup_shape, take<0, 3>(tile_coord), Step< X, _1, _1>{}); // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work. const int work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, workgroup_shape); const int work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info); auto k_tile_iter = cute::make_coord_iterator(idx2crd(work_k_tile_start, make_shape(K)), make_shape(K)); - const auto tile_coord = make_coord(m_coord, n_coord, _, l_coord); - - Tensor tAi = params.mainloop.gmem_tiled_copy_a.get_pvc_tensor( - make_coord(m_coord, 0, 0), - make_shape(_1{}, K, L), - make_stride(Int{} * get<0>(MmaAtomShape()),_1{})); - - constexpr int version = - is_same_v - ? 
1
-        : 2;
-
-    Tensor tBi = params.mainloop.gmem_tiled_copy_b.get_pvc_tensor(
-      make_coord(n_coord, 0, 0),
-      make_shape(Int{}, K, L),
-      make_stride(Int(MmaAtomShape())>{}, _1{}));
-    auto k_residue = K - get<2>(subgroup_shape) * (K / get<2>(subgroup_shape)); // K - SUB_K * k_coord_max
+    auto k_residue = K - get<2>(subgroup_shape) * (K / get<2>(subgroup_shape)); // K - SUB_K * k_coord_max
 
     // Compute tile residues for predication
     auto m_max_coord = M - get<0>(subgroup_shape) * m_coord;   // M - SUB_M * m_coord
     auto n_max_coord = N - get<1>(subgroup_shape) * n_coord;   // N - SUB_N * n_coord
     auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue);
 
-    Tensor accumulators = make_tensor(Shape, Int, Int>{});
+    TiledMma tiled_mma;
+    Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(workgroup_shape));
 
     CollectiveMainloop collective_mma;
 
     // Perform the collective scoped MMA
     collective_mma(
       accumulators,
-      tAi(_,_,_,l_coord),
-      tBi(_,_,_,l_coord),
+      gA,
+      gB,
       accumulators,
       k_tile_iter, work_k_tile_count,
       residue_mnk,
+      tile_coord,
+      K,
       thread_idx,
       smem_buf,
       params.mainloop
@@ -307,7 +298,6 @@ class GemmUniversal<
     if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
       CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue};
 
-      TiledMma tiled_mma;
       epilogue(
         problem_shape_MNKL,

From 8e20733807b11784418b5c9322aa00cc3f0a181c Mon Sep 17 00:00:00 2001
From: Muhammad Tanvir
Date: Thu, 24 Oct 2024 06:53:17 +0100
Subject: [PATCH 29/30] Address PR feedback

---
 include/cutlass/arch/barrier.h                        |  8 ++++----
 .../xe_persistent_tile_scheduler_params_streamk.hpp   | 10 ++++------
 .../cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp |  5 ++---
 include/cutlass/workspace.h                           |  2 +-
 4 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/include/cutlass/arch/barrier.h b/include/cutlass/arch/barrier.h
index db2777e24a..0e1f344f27 100644
--- a/include/cutlass/arch/barrier.h
+++ b/include/cutlass/arch/barrier.h
@@ -37,7 +37,7 @@
 #include
 #include
 
-#if defined SYCL_INTEL_TARGET
+#if defined(SYCL_INTEL_TARGET)
 SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierWaitINTEL(int execution_scope, int memory_scope, int memory_semantics);
 SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierArriveINTEL(int execution_scope, int memory_scope, int memory_semantics);
 
@@ -160,10 +160,10 @@ class NamedBarrier {
 private:
   CUTLASS_DEVICE
   static void arrive_and_wait_internal(uint32_t num_threads, uint32_t barrier_id) {
-#if defined SYCL_INTEL_TARGET
+#if defined(SYCL_INTEL_TARGET)
     __spirv_ControlBarrierArriveINTEL(EXECUTION_SCOPE_WORK_GROUP, MEMORY_SCOPE_WORK_GROUP, MEMORY_SEMANTICS_RELAXED);
     __spirv_ControlBarrierWaitINTEL(EXECUTION_SCOPE_WORK_GROUP, MEMORY_SCOPE_WORK_GROUP, MEMORY_SEMANTICS_RELAXED);
-#elif defined CUDA_BARRIER_ENABLED
+#elif CUDA_BARRIER_ENABLED
     asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads));
 #elif defined(__CUDA_ARCH__)
     asm volatile ("brkpt;\n" ::);
@@ -172,7 +172,7 @@ class NamedBarrier {
 
   CUTLASS_DEVICE
   static void arrive_internal(uint32_t num_threads, uint32_t barrier_id) {
-#if defined SYCL_INTEL_TARGET
+#if defined(SYCL_INTEL_TARGET)
     __spirv_ControlBarrierArriveINTEL(EXECUTION_SCOPE_WORK_GROUP, MEMORY_SCOPE_WORK_GROUP, MEMORY_SEMANTICS_RELAXED);
 #elif CUDA_BARRIER_ENABLED
     asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads));
diff --git a/include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp
index 443e2cf100..b98613b545 100644
--- a/include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp
+++ b/include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp
@@ -244,8 +244,6 @@ struct PersistentTileSchedulerXeStreamKParams {
 
     // Calculate the number of work units covering the data-parallel and stream-K tiles.
     // A "work unit" is a single index in the linearized ID space used by the scheduler.
-    // We distinguish it from a "block," which is typically tied to a hardware unit
-    // (e.g., the callers into this scheduler will be persistent thread blocks).
     // A work unit can encompass multiple output tiles worth of work (as will be the
     // case for stream-K blocks).
     // Since splitting is not required for data-parallel tiles, only one data-parallel unit
@@ -438,12 +436,12 @@ struct PersistentTileSchedulerXeStreamKParams {
   static uint64_t
   get_num_sk_units(uint64_t wgs_per_sk_wave, uint32_t sk_tiles, uint32_t k_tiles_per_output_tile) {
     // If there are stream-K tiles to compute and a sufficiently large number of k iterations
-    // across them, they will be covered by a single wave of persistent threadblocks. Thus, there
-    // will be as many work units as there are threadblocks in a single wave.
+    // across them, they will be covered by a single wave of persistent work_groups. Thus, there
+    // will be as many work units as there are work_groups in a single wave.
     //
     // When the total k iterations across stream-K tiles is too small to justify distributing
-    // across an entire wave of blocks, we instead distribute the iterations over a smaller
-    // set of blocks.
+    // across an entire wave of work_groups, we instead distribute the iterations over a smaller
+    // set of work_groups.
 
     // Calculate the number of stream-K units that would be needed if each stream-K unit
     // computed the minimum allowable k iterations.
diff --git a/include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp
index 1caf2cba0a..4b405b39cc 100644
--- a/include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp
+++ b/include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp
@@ -245,8 +245,7 @@ class PersistentTileSchedulerXeStreamK {
     current_work_linear_idx_ += uint64_t(GridDimX()) * uint64_t(GridDimY()) * uint64_t(GridDimZ()) * uint64_t(advance_count);
   }
 
-  // Given the inputs, computes the total number of output blocks this problem will compute over
-  // Note that this is only the logical size of our grid, not the physical grid we will actually launch.
+  // Given the inputs, computes the total number of output work-groups this problem will compute over.
   template
   CUTLASS_HOST_DEVICE static
   dim3
@@ -254,7 +253,7 @@
     return Params::get_tiled_wg_shape_mnl(to_gemm_coord(problem_shape_mnkl), to_gemm_coord(cta_shape));
   }
 
-  // Given the cluster shape, computes the physical grid we should launch.
+  // Computes the physical grid we should launch.
   template
   CUTLASS_HOST_DEVICE static
   dim3
diff --git a/include/cutlass/workspace.h b/include/cutlass/workspace.h
index bb74826805..79f3aa3d0c 100644
--- a/include/cutlass/workspace.h
+++ b/include/cutlass/workspace.h
@@ -61,7 +61,7 @@ zero_workspace(void* workspace, size_t workspace_size, cudaStream_t stream = nul
 
   CUTLASS_TRACE_HOST("  clearing workspace");
 
-#if defined CUTLASS_ENABLE_SYCL
+#if defined (CUTLASS_ENABLE_SYCL)
   syclcompat::memset_async(workspace, 0, workspace_size);
 #elif defined(CUTLASS_ENABLE_CUDA_HOST_ADAPTER) && CUTLASS_ENABLE_CUDA_HOST_ADAPTER
   //

From af74f7a755c573f9aa2ff1f9ed33c0b0d696d419 Mon Sep 17 00:00:00 2001
From: Muhammad Tanvir
Date: Thu, 7 Nov 2024 17:02:12 +0000
Subject: [PATCH 30/30] Fix tile size

---
 examples/sycl/pvc/pvc_gemm_streamk.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/sycl/pvc/pvc_gemm_streamk.cpp b/examples/sycl/pvc/pvc_gemm_streamk.cpp
index 6f142f98ff..3546ab62e3 100644
--- a/examples/sycl/pvc/pvc_gemm_streamk.cpp
+++ b/examples/sycl/pvc/pvc_gemm_streamk.cpp
@@ -358,15 +358,15 @@ int main(int argc, const char** argv)
   using LayoutC = cutlass::layout::RowMajor;
   using LayoutD = cutlass::layout::RowMajor;
 
-  using GmemTiledCopyA = XE_2D_U16x8x16_LD_N;
-  using GmemTiledCopyB = XE_2D_U16x16x16_LD_V;
+  using GmemTiledCopyA = XE_2D_U16x32x32_LD_N;
+  using GmemTiledCopyB = XE_2D_U16x32x32_LD_V;
 
   // Workgroup-level tile
-  using TileShape = Shape<_256, _128, _16>;
+  using TileShape = Shape<_256, _256, _32>;
 
   using TiledMma = TiledMMA,
-          Layout>,
-          Tile<_64,_32,_16>>;  // Subgroup level-tile
+          Layout>,
+          Tile<_64,_64,_32>>;  // Subgroup level-tile
 
   constexpr int PipelineStages = 3;
   using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelPVC;
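The comment block updated in get_num_sk_units above describes how the scheduler sizes the set of stream-K work_groups: a full persistent wave when the leftover stream-K tiles carry enough K iterations, and a smaller set when they do not. The standalone sketch below illustrates only that sizing rule; the function name, the min_iters_per_sk_unit constant, and the example numbers are assumptions for illustration and are not taken from the CUTLASS sources.

// sk_units_sketch.cpp -- illustrative sketch only; not part of CUTLASS.
#include <algorithm>
#include <cstdint>
#include <iostream>

// Assumed minimum number of K iterations worth assigning to one stream-K unit.
constexpr uint64_t min_iters_per_sk_unit = 8;

// Pick how many stream-K units (persistent work_groups) should share the
// leftover stream-K tiles: at most one unit per work_group in a single wave,
// and fewer when the total K work is too small to keep a full wave busy.
uint64_t choose_num_sk_units(uint64_t wgs_per_sk_wave,
                             uint64_t sk_tiles,
                             uint64_t k_tiles_per_output_tile) {
  // Total K iterations contributed by all stream-K tiles.
  uint64_t total_k_iters = sk_tiles * k_tiles_per_output_tile;
  // Cap the unit count so each unit gets at least min_iters_per_sk_unit iterations.
  uint64_t max_units_by_work =
      std::max<uint64_t>(total_k_iters / min_iters_per_sk_unit, 1);
  return std::min(wgs_per_sk_wave, max_units_by_work);
}

int main() {
  // Plenty of K work: every work_group in a 64-wide wave gets a unit (prints 64).
  std::cout << choose_num_sk_units(64, 8, 64) << "\n";
  // Very little K work: the iterations are shared by only 2 units (prints 2).
  std::cout << choose_num_sk_units(64, 1, 16) << "\n";
  return 0;
}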