From 49a986ab69051ab20e4c0c730752d867dc170376 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 14 Aug 2024 12:43:01 +0100 Subject: [PATCH 01/30] WIP: Introduce StreamK for PVC --- examples/sycl/pvc/pvc_gemm.cpp | 3 +- .../gemm/device/gemm_universal_adapter.h | 7 +- .../cutlass/gemm/kernel/gemm_universal.hpp | 1 + .../cutlass/gemm/kernel/intel_pvc_gemm.hpp | 3 +- .../gemm/kernel/intel_pvc_gemm_streamk.hpp | 545 +++++++++++ ...rsistent_tile_scheduler_params_streamk.hpp | 770 ++++++++++++++++ .../intel_pvc_tile_scheduler_streamk.hpp | 857 ++++++++++++++++++ .../cutlass/gemm/kernel/tile_scheduler.hpp | 16 + include/cutlass/workspace.h | 4 +- 9 files changed, 2200 insertions(+), 6 deletions(-) create mode 100644 include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp create mode 100644 include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp create mode 100644 include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp diff --git a/examples/sycl/pvc/pvc_gemm.cpp b/examples/sycl/pvc/pvc_gemm.cpp index 7959f7c0f4..79678e71b9 100644 --- a/examples/sycl/pvc/pvc_gemm.cpp +++ b/examples/sycl/pvc/pvc_gemm.cpp @@ -386,7 +386,8 @@ int main(int argc, const char** argv) using GemmKernel = cutlass::gemm::kernel::GemmUniversal< Shape, CollectiveMainloop, - CollectiveEpilogue + CollectiveEpilogue, + cutlass::gemm::StreamKScheduler >; using Gemm = cutlass::gemm::device::GemmUniversalAdapter; diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h index 40a21b1078..e1bf6ee2d6 100644 --- a/include/cutlass/gemm/device/gemm_universal_adapter.h +++ b/include/cutlass/gemm/device/gemm_universal_adapter.h @@ -91,7 +91,8 @@ class GemmUniversalAdapter; template class GemmUniversalAdapter< GemmKernel_, - cute::enable_if_t::value>> + // cute::enable_if_t::value>> + cute::enable_if_t> { public: using GemmKernel = GemmKernel_; @@ -505,7 +506,7 @@ class GemmUniversalAdapter< ////////////////////////////// CUTLASS 2.x API ///////////////////////////////// //////////////////////////////////////////////////////////////////////////////// -template +/*template class GemmUniversalAdapter< GemmKernel_, cute::enable_if_t::value>> @@ -666,7 +667,7 @@ class GemmUniversalAdapter< return status; } -}; +};*/ //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/gemm_universal.hpp b/include/cutlass/gemm/kernel/gemm_universal.hpp index bdfefa91cc..3b601005d4 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.hpp +++ b/include/cutlass/gemm/kernel/gemm_universal.hpp @@ -65,5 +65,6 @@ struct IsCutlass3ArrayKernel>> + cute::enable_if_t + && cute::is_same_v>> { public: // diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp new file mode 100644 index 0000000000..0e128bb8a5 --- /dev/null +++ b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp @@ -0,0 +1,545 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/workspace.h" +#include "cutlass/kernel_hardware_info.hpp" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/kernel/tile_scheduler.hpp" +#include "cute/tensor.hpp" + +/////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::kernel { + +/////////////////////////////////////////////////////////////////////////////// + +template < + class ProblemShape_, + class CollectiveMainloop_, + class CollectiveEpilogue_, + class TileScheduler_ +> +class GemmUniversal< + ProblemShape_, + CollectiveMainloop_, + CollectiveEpilogue_, + TileScheduler_, + cute::enable_if_t + && cute::is_same_v>> +{ +public: + // + // Type Aliases + // + using ProblemShape = ProblemShape_; + static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4, + "ProblemShape{} should be or "); + + // Mainloop derived types + using CollectiveMainloop = CollectiveMainloop_; + using TileShape = typename CollectiveMainloop::WorkgroupTileShape; + using WorkgroupTileShape = TileShape; + using TiledMma = typename CollectiveMainloop::TiledMma; + using ArchTag = typename CollectiveMainloop::ArchTag; + using ElementA = typename CollectiveMainloop::ElementA; + using StrideA = typename CollectiveMainloop::StrideA; + using ElementB = typename CollectiveMainloop::ElementB; + using StrideB = typename CollectiveMainloop::StrideB; + using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy; + using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator; + using ClusterShape = typename DispatchPolicy::ClusterShape; + using MainloopArguments = typename CollectiveMainloop::Arguments; + using MainloopParams = typename CollectiveMainloop::Params; + + // Epilogue derived types + using CollectiveEpilogue = CollectiveEpilogue_; + using ElementC = typename CollectiveEpilogue::ElementC; + using StrideC = typename CollectiveEpilogue::StrideC; + using ElementD = typename CollectiveEpilogue::ElementD; + using StrideD = typename CollectiveEpilogue::StrideD; + using EpilogueArguments = typename CollectiveEpilogue::Arguments; + using EpilogueParams = typename 
CollectiveEpilogue::Params; + +// static_assert(ArchTag::kMinComputeCapability >= 90); + + using TileSchedulerTag = TileScheduler_; + using TileScheduler = typename detail::TileSchedulerSelector< + TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler; + using TileSchedulerArguments = typename TileScheduler::Arguments; + using TileSchedulerParams = typename TileScheduler::Params; + + static constexpr uint32_t MaxThreadsPerBlock = CUTE_STATIC_V(size(TiledMma{})); + static constexpr uint32_t MinBlocksPerMultiprocessor = 1; + + /// Register requirement for Load and Math WGs +// static constexpr uint32_t LoadRegisterRequirement = 40; +// static constexpr uint32_t MmaRegisterRequirement = 232; + + // 1 stage ordered sequence between mainloop and epilogue producer load threads +// using LoadWarpOrderBarrier = cutlass::OrderedSequenceBarrier<1,2>; + + // Kernel level shared memory storage + struct SharedStorage { + using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage; + EpilogueTensorStorage epilogue; + }; + + static constexpr int SharedStorageSize = sizeof(SharedStorage); + + // Device side arguments + struct Arguments { + GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + MainloopArguments mainloop{}; + EpilogueArguments epilogue{}; + KernelHardwareInfo hw_info{}; + TileSchedulerArguments scheduler{}; + }; + + // Kernel entry point API + struct Params { + GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + MainloopParams mainloop{}; + EpilogueParams epilogue{}; + KernelHardwareInfo hw_info{}; + TileSchedulerParams scheduler{}; + void* workspace{nullptr}; + }; + + // + // Methods + // + + // Convert to underlying arguments. In this case, a simple copy for the aliased type. + static + Params + to_underlying_arguments(Arguments const& args, void* workspace) { + CUTLASS_TRACE_HOST("to_underlying_arguments():"); + + auto problem_shape = args.problem_shape; + + auto problem_shape_MNKL = append<4>(problem_shape, 1); + + // Get SM count if needed, otherwise use user supplied SM count + int sm_count = args.hw_info.sm_count; + if (sm_count <= 0) { + CUTLASS_TRACE_HOST(" WARNING: Arguments do not include a valid SM count.\n" + " For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count."); + sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id); + } + + CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count); + + KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count}; + + // Calculate workspace pointers + uint8_t* workspace_ptr = reinterpret_cast(workspace); + size_t workspace_offset = 0; + + void* scheduler_workspace = workspace_ptr; + workspace_offset += TileScheduler::template get_workspace_size( + args.scheduler, args.problem_shape, args.hw_info); + + void* epilogue_workspace = workspace_ptr + workspace_offset; + workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue); + + void* mainloop_workspace = nullptr; + // Precompute the sub tiles numbers in epilogue, pass into tile scheduler. Therefore it will be used + // in separate reduction scheme for streamk case, NumEpilogueSubTiles default value is 1, which means + // subtile will not be used, therefore separate reduction will not be enabled. 
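+    // Layout sketch of the workspace carved up above (illustrative, not a new contract):
+    //   [ scheduler workspace | epilogue workspace ]
+    // where the scheduler portion itself is laid out by the PVC stream-K scheduler params as
+    //   [ reduction accumulators | reduction barriers ]
+    // and its initialize_workspace zeroes only the barrier region.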
+ // constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{}); + TileSchedulerParams scheduler = TileScheduler::to_underlying_arguments( + problem_shape_MNKL, TileShape{}, hw_info, args.scheduler, scheduler_workspace); + + return { + args.mode, + problem_shape, + CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, mainloop_workspace), + CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, epilogue_workspace), + hw_info, + scheduler, + workspace + }; + } + + static bool + can_implement(Arguments const& args) { + bool mode_implementable = args.mode == GemmUniversalMode::kGemm or + (args.mode == GemmUniversalMode::kBatched && rank(ProblemShape{}) == 4); + return mode_implementable && TileScheduler::can_implement(args.scheduler); + } + + static size_t + get_workspace_size(Arguments const& args) { + size_t workspace_size = 0; + + workspace_size += TileScheduler::template get_workspace_size( + args.scheduler, args.problem_shape, args.hw_info); + workspace_size += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue); + return workspace_size; + } + + static cutlass::Status + initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + Status status = Status::kSuccess; + uint8_t* workspace_ptr = reinterpret_cast(workspace); + size_t workspace_offset = 0; + + status = TileScheduler::template initialize_workspace( + args.scheduler, workspace_ptr + workspace_offset, args.problem_shape, args.hw_info); + workspace_offset += TileScheduler::template get_workspace_size( + args.scheduler, args.problem_shape, args.hw_info); + if (status != Status::kSuccess) { + return status; + } + + status = CollectiveEpilogue::initialize_workspace(args.problem_shape, args.epilogue, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue); + if (status != Status::kSuccess) { + return status; + } + + return status; + } + + // Computes the kernel launch grid shape based on runtime parameters + static dim3 + get_grid_shape(Params const& params) { + // Given device SM count, set grid size s.t. we do not launch more thread blocks than we can run concurrently + // TileSchedulerArguments args{}; + return TileScheduler::get_grid_shape(params.problem_shape, TileShape{}, params.hw_info); + } + + static dim3 + get_block_shape() { + return dim3(MaxThreadsPerBlock, 1, 1); + } + + CUTLASS_DEVICE + void + operator()(Params const& params, char* smem_buf) { + printf("I am here\n"); + + +/* using namespace cute; + using X = Underscore; + + // Preconditions + static_assert(size(TiledMma{}) == 256, "Cooperative kernel must have TiledMMA operating using 256 threads."); + static_assert(size<0>(TileShape{}) >= 128, + "Cooperative kernel requires Tile Size to be greater than or equal to 128 along the M-dimension."); + + static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>."); + static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. 
If batch mode is not needed, set L stride to Int<0>."); + + // Kernel level shared memory storage + SharedStorage& shared_storage = *reinterpret_cast(smem_buf); + + // Mainloop Load pipeline + using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline; + typename MainloopPipeline::Params mainloop_pipeline_params; + if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Mainloop) { + mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer; + } + if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) { + mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer; + } + mainloop_pipeline_params.is_leader = warp_group_thread_idx == 0; + mainloop_pipeline_params.num_consumers = size(TiledMma{}); + mainloop_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes; + MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params, ClusterShape{}); + + // Epilogue Load pipeline + using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline; + typename EpiLoadPipeline::Params epi_load_pipeline_params; + if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Epilogue) { + epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer; + } + if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) { + epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer; + } + epi_load_pipeline_params.dst_blockid = cute::block_rank_in_cluster(); + epi_load_pipeline_params.producer_arv_count = NumThreadsPerWarp; + epi_load_pipeline_params.consumer_arv_count = size(TiledMma{}); + if constexpr (CollectiveEpilogue::RequiresTransactionBytes) { + epi_load_pipeline_params.transaction_bytes = params.epilogue.tma_transaction_bytes; + } + EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params); + + // Epilogue Store pipeline + using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline; + typename EpiStorePipeline::Params epi_store_pipeline_params; + epi_store_pipeline_params.always_wait = true; + EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params); + + typename LoadWarpOrderBarrier::Params params_load_order_barrier; + params_load_order_barrier.group_id = producer_warp_role == ProducerWarpRole::Mainloop ? 
0 : 1; + params_load_order_barrier.group_size = NumThreadsPerWarp; + LoadWarpOrderBarrier load_order_barrier(shared_storage.pipelines.load_order, params_load_order_barrier); + + // Initialize starting pipeline states for the collectives + // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding) + typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state; + typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state; + + // For the DMA Load (producer) we start with an opposite phase + // i.e., we skip all waits since we know that the buffer is indeed empty + PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state(); + PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state(); + PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state(); + + auto cluster_wait_fn = [] () { + // We need this to guarantee that the Pipeline init is visible + // To all producers and consumer thread blocks in the Cluster + if constexpr (size(ClusterShape{}) > 1) { + cute::cluster_arrive_relaxed(); + return [] () { cute::cluster_wait(); }; + } + else { + syncthreads(); + return [] () {}; // do nothing + } + } (); + + // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{}); + + // Get the appropriate blocks for this thread block -- potential for thread block locality + TiledMma tiled_mma; + auto blk_shape = TileShape{}; // (BLK_M,BLK_N,BLK_K) + + TileScheduler scheduler{params.scheduler}; + auto work_tile_info = scheduler.initial_work_tile_info(ClusterShape{}); + + // In a warp specialized kernel, collectives expose data movement and compute operations separately + CollectiveMainloop collective_mainloop; + + // Prepare and partition the input tensors. Expects a tuple of tensors where: + // get<0>(load_inputs) is the tma tensor A after local tiling so that it has shape (BLK_M,BLK_K,m,k,l) + // get<1>(load_inputs) is the tma tensor B after local tiling so that it has shape (BLK_N,BLK_K,n,k,l) + auto load_inputs = collective_mainloop.load_init(problem_shape_MNKL, params.mainloop); + static_assert(cute::tuple_size_v >= 2, "Output of load_init must have at least two elements (A, B)"); + + // Extract out partitioned A and B. + Tensor gA_mkl = get<0>(load_inputs); + Tensor gB_nkl = get<1>(load_inputs); + + // Wait for all thread blocks in the Cluster + cluster_wait_fn(); + + if (warp_group_role == WarpGroupRole::Producer) { + cutlass::arch::warpgroup_reg_dealloc(); + + CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue); + + // Mainloop Producer Warp + if (producer_warp_role == ProducerWarpRole::Mainloop) { + bool do_load_order_arrive = true; + while (work_tile_info.is_valid()) { + if (!TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) { + work_tile_info = scheduler.fetch_next_work(work_tile_info); + continue; + } + + // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape + auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl)); + auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl)); + auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); + auto blk_coord = make_coord(m_coord, n_coord, _, l_coord); + + // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work. 
+ auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape); + auto work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info); + auto k_tile_iter = cute::make_coord_iterator(idx2crd(work_k_tile_start, shape<3>(gA_mkl)), shape<3>(gA_mkl)); + + collective_mainloop.load( + params.mainloop, + mainloop_pipeline, + mainloop_pipe_producer_state, + load_inputs, + blk_coord, + k_tile_iter, work_k_tile_count, + lane_idx, + block_rank_in_cluster, + shared_storage.tensors.mainloop + ); + // Update starting pipeline state for the next tile + mainloop_pipe_producer_state.advance(work_k_tile_count); + + // Signal for the epilogue load warp to begin + if (do_load_order_arrive) { + load_order_barrier.arrive(); + do_load_order_arrive = false; + } + + // Get next work tile + work_tile_info = scheduler.fetch_next_work(work_tile_info); + } // Scheduler work fetch loop + + // Make sure all Consumer Warp Groups have been waited upon + collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state); + + } // Mainloop Producer Warp End + + // Epilogue Producer Warp + else if (producer_warp_role == ProducerWarpRole::Epilogue && collective_epilogue.is_producer_load_needed()) { + + if (!TileScheduler::requires_separate_reduction(params.scheduler) && work_tile_info.is_valid()) { + load_order_barrier.wait(); + } + while (work_tile_info.is_valid()) { + if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) { + // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape + auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl)); + auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl)); + auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); + auto blk_coord = make_coord(m_coord, n_coord, _, l_coord); + + epi_load_pipe_producer_state = + collective_epilogue.load( + epi_load_pipeline, + epi_load_pipe_producer_state, + problem_shape_MNKL, + blk_shape, + blk_coord, + tiled_mma, + lane_idx, + shared_storage.tensors.epilogue, + work_tile_info.reduction_subtile_idx() + ); + } + + // Get next work tile + work_tile_info = scheduler.fetch_next_work(work_tile_info); + } // Scheduler work fetch loop + + // Make sure all Consumer Warp Groups have been waited upon + collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state); + } // Epilogue Producer Warp End + } // Producer Warp Group End + + else if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) { + cutlass::arch::warpgroup_reg_alloc(); + + CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue); + + // Do we potentially issue tail arrives for TMA stores, if epilogue load is waiting for it + bool do_store_tail = false; + while (work_tile_info.is_valid()) { + // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape + auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl)); + auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl)); + auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl)); + auto blk_coord = make_coord(m_coord, n_coord, _, l_coord); + auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape); + + // Allocate the accumulators for the (M,N) blk_shape + // + // MSVC CTAD breaks if we say "Tensor" here, so we use "auto" instead. 
+ auto accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N) + if(TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) { + collective_mainloop.mma( + mainloop_pipeline, + mainloop_pipe_consumer_state, + accumulators, + work_k_tile_count, + mma_thread_idx, + shared_storage.tensors.mainloop, + params.mainloop + ); + + // Make sure the math instructions are done and free buffers before entering the epilogue + collective_mainloop.mma_tail( + mainloop_pipeline, + mainloop_pipe_consumer_state, + work_k_tile_count + ); + + // Update starting mainloop pipeline state for the next tile + mainloop_pipe_consumer_state.advance(work_k_tile_count); + } + // Index of warp group within consumer warp groups + int consumer_warp_group_idx = canonical_warp_group_idx() - NumLoadWarpGroups; + + // Perform reduction across splits, if needed + TileScheduler::fixup( + params.scheduler, work_tile_info, accumulators, NumMmaWarpGroups, consumer_warp_group_idx); + + if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) { + // Epilogue and write to gD + auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] = + collective_epilogue.store( + epi_load_pipeline, + epi_load_pipe_consumer_state, + epi_store_pipeline, + epi_store_pipe_producer_state, + problem_shape_MNKL, + blk_shape, + blk_coord, + accumulators, + tiled_mma, + mma_thread_idx, + shared_storage.tensors.epilogue, + work_tile_info.reduction_subtile_idx() + ); + epi_load_pipe_consumer_state = epi_load_pipe_consumer_state_next; + epi_store_pipe_producer_state = epi_store_pipe_producer_state_next; + do_store_tail = true; + } + + // Get next work tile + work_tile_info = scheduler.fetch_next_work(work_tile_info); + } // Scheduler work fetch loop + + if (do_store_tail) { + collective_epilogue.store_tail( + epi_load_pipeline, + epi_load_pipe_consumer_state, + epi_store_pipeline, + epi_store_pipe_producer_state + ); + } + } // Consumer Warp Groups End + */ + } + +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::kernel diff --git a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp new file mode 100644 index 0000000000..96e1368b2b --- /dev/null +++ b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp @@ -0,0 +1,770 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +/*! \file + \brief Parameters structures for persistent tile schedulers +*/ + +#include "cutlass/coord.h" +#include "cutlass/kernel_hardware_info.h" +#include "cutlass/workspace.h" +#include "cutlass/platform/platform.h" +#include "cutlass/fast_math.h" +#include "cutlass/gemm_coord.h" +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { +namespace detail { + +//////////////////////////////////////////////////////////////////////////////// +// Parameters for Intel PVC persistent stream-K scheduler +struct PersistentTileSchedulerIntelPVCStreamKParams { + + // Strategies for computing reductions between CTAs computing portions of a given output tile + enum class ReductionMode { + // Participating CTAs perform reduction in a turnstile fashion in order of the K extent + // covered by each CTA. This requires a lock to be held exclusively be the CTA that is + // currently accumulating. + // + // Turnstile accumulation ensures deterministic numeric behavior when using this mode. + Deterministic, + + // Participating CTAs perform reduction atomically to the same workspace (mostly) without locking. + // Locks are used only to wait for the first CTA to write its partial values (to initialize the + // workspace), and for all but the final CTA to have accumulated (so that the final CTA can load + // the accumulated value and accumulate it into registers on top of which the epilogue will + // be performed). + // + // Due to the nondeterminsitic ordering of accumulation, deterministic numeric behavior cannot + // be guaranteed with this mode (e.g., floating-point rounding error will depend on the order + // of accumulation) + Nondeterministic + }; + + // Strategies for decomposing the problem + enum class DecompositionMode { + // Use a heuristic to determine whether data-parallel, split-K, or stream-K decomposition should be performed + Heuristic, + // Force a data-parallel decomposition + DataParallel, + // Force a split-K decomposition. This should be paired with setting the `splits` parameter + SplitK, + // Force a stream-K decomposition + StreamK + }; + + FastDivmodU64 divmod_batch_{}; + + // We divide up the number of stream-K tiles amongst G groups of stream-K units. + // The stream-K units within a group collaborate to comptue over the `sk_tiles / G` + // tiles assigned to that group. 
Non-unit group sizes can help to preserve L2 locality of + // partial chunks computed by stream-K units -- units 0 in each group will compute identical K extents + // of tiles that would be assigned in the same wave according to the rasterization order of the + // data-parallel formulation of the problem. + FastDivmodU64 divmod_sk_groups_{}; + + // Number of stream-K units in each group + FastDivmodU64 divmod_sk_units_per_group_{}; + + uint64_t units_per_problem_ = 0; + FastDivmod divmod_tiles_per_output_tile_{}; + + // The splitting factor to be used in a split-K decomposition of the problem. + // If this is set to a value greater than 1, stream-K decomposition logic + // is bypassed in favor of a split-K decomposition. + FastDivmod divmod_splits_{}; + + // Number of stream-K or split-K work units that compute an extra k iteration. + // This is done to handle residuals in dividing up the k iteration space. + // For stream-K, since the actual assignment of work to stream-K units will be done + // at the granularity of a cluster, we store only the number of big clusters. + uint32_t big_units_ = 0; + + // The number of groups of stream-K units that will process an extra stream-K tile cluster. + uint32_t big_groups_ = 0; + + // Workspace for holding partial accumulators to be reduced across stream-K/split-K units + void* reduction_workspace_ = nullptr; + + // Number of tiles covered by stream-K work units + uint32_t sk_tiles_ = 0; + + // Number of work units computing stream-K tiles + uint32_t sk_units_ = 0; + + // Number of tiled k iterations computed by each stream-K work unit. This + // can potentially cover more than one output tile. + FastDivmod divmod_k_tiles_per_sk_unit_{}; + // Number of tiled k iterations computed by each "big" stream-K units, which + // processes one more K chunk than a "normal" stream-K unit. + FastDivmod divmod_k_tiles_per_sk_big_unit_{}; + + // Strategy to use when reducing between collaborating CTAs + ReductionMode reduction_mode_ = ReductionMode::Deterministic; + + // Minimum number of k tiles that can be assigned to a stream-K unit + static constexpr uint32_t min_iters_per_sk_unit_ = 8u; + + // Maximum number of groups of stream-K units + static constexpr uint32_t max_sk_groups_ = 8u; + + // ktile start from even for each cta + uint32_t ktile_start_alignment_count { 1u }; + + // Returns the maximum number of peers that can collaborate on a given output tile + CUTLASS_HOST_DEVICE + static uint32_t + max_peers_per_tile(uint64_t sk_units, uint64_t sk_tiles) { + // When we can divide up our SK units to SK tiles evenly, the number of peers + // per SK tile is exactly (sk_units_ / sk_tiles_). In cases where this division + // is not exact, some tiles will need to be covered by additional SK units. Because + // the extra work can occur at both the beginning and the end of the SK tile, at + // most 2 extra peers will be needed. + return static_cast(sk_units / sk_tiles + 2); + } + + // Initializes members. This variant of the method should only be used when + // problem_shape and tile_shape contain modes of only rank 1. 
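+  // (For modes of rank greater than 1, use the overload below that takes the pre-computed
+  // problem_blocks instead.)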
+ void + initialize( + BatchedGemmCoord problem_shape, + GemmCoord tile_shape, + KernelHardwareInfo hw_info, + int splits, + ReductionMode reduction_mode, + DecompositionMode decomposition_mode, + void* workspace + ) { + + dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, tile_shape); + // Number of k tiles in each output tile + uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k(); + + initialize( + problem_blocks, + k_tiles_per_output_tile, + hw_info, + splits, + reduction_mode, + decomposition_mode, + workspace + ); + } + + // Version of initialize that takes in as input the number of CTAs in the M and N and L dimensions. + // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1, + // for which using CuTe algebra for calculating tile shapes is easiest. + void + initialize( + dim3 problem_blocks, + uint32_t k_tiles_per_output_tile, + KernelHardwareInfo hw_info, + int splits, + ReductionMode reduction_mode, + DecompositionMode decomposition_mode, + void* workspace + ) { + + auto problem_blocks_l = problem_blocks.z; + + auto problem_blocks_m = problem_blocks.x; + auto problem_blocks_n = problem_blocks.y; + uint64_t output_tiles = problem_blocks_m * problem_blocks_n * problem_blocks_l; + + // Reduction workspace is at the beginning of the workspace. Lock workspace follows. + void* reduction_workspace = workspace; + + if (decomposition_mode == DecompositionMode::SplitK || + (decomposition_mode == DecompositionMode::Heuristic && splits > 1)) { + // Short circuit to basic split-K decomposition + + // Don't split by more than the available number of SMs + if (splits > hw_info.sm_count) { + splits = hw_info.sm_count; + } + + // Don't split by more than the K tile iterations + // + // splits is almost certainly nonnegative here (e.g., hw_info.sm_count, + // despite being an int, is a count), so it can safely be converted to unsigned + // in the comparison to avoid a signed-unsigned comparison warning-as-error. + if (static_cast(splits) > k_tiles_per_output_tile) { + splits = k_tiles_per_output_tile; + } + + // If splits == k_tiles_per_output_tiles, there will be one k_tile per cta + // and this violate k_tile start from even requirements. Thus we need to + // reduce the number of splits. + if (ktile_start_alignment_count > 1u && + static_cast(splits) == k_tiles_per_output_tile) { + splits = k_tiles_per_output_tile / ktile_start_alignment_count; + } + + set_params_basic( + problem_blocks_m, + problem_blocks_n, + problem_blocks_l, + splits, + k_tiles_per_output_tile, + reduction_workspace, + reduction_mode + ); + return; + } + + // Calculate the maximum number of blocks from clusters of shape cluster_shape that we + // can fit within sm_count SMs. + dim3 grid = get_grid_shape( + problem_blocks, + hw_info + ); + + uint64_t ctas_per_wave = grid.x * grid.y; + // The number of output tiles to be computed in stream-K and data-parallel fashion, respectively. + uint32_t sk_tiles = get_num_sk_tiles( + output_tiles, + ctas_per_wave, + k_tiles_per_output_tile, + decomposition_mode + ); + uint64_t dp_tiles = output_tiles - sk_tiles; + + // Calculate the number of work units covering the data-parallel and stream-K tiles. + // A "work unit" is a single index in the linearized ID space used by the scheduler. + // We distinguish it from a "block," which is typically tied to a hardware unit + // (e.g., the callers into this scheduler will be persistent thread blocks). 
+ // A work unit can encompass multiple output tiles worth of work (as will be the + // case for stream-K blocks). + // Since splitting is not required for data-parallel tiles, only one data-parallel unit + // is needed per data-parallel tile. + uint64_t dp_units = dp_tiles; + + uint64_t ctas_per_sk_wave = ctas_per_wave; + uint64_t sk_units = get_num_sk_units(ctas_per_sk_wave, sk_tiles, k_tiles_per_output_tile); + + if (decomposition_mode == DecompositionMode::DataParallel || + (decomposition_mode == DecompositionMode::Heuristic && sk_tiles == 0) || + sk_units == 0) { + // Short circuit to basic data-parallel decomposition + set_params_basic( + problem_blocks_m, + problem_blocks_n, + problem_blocks_l, + /* splits = */ 1, + k_tiles_per_output_tile, + reduction_workspace, + reduction_mode + ); + return; + } + + bool do_separate_reduction = false; + // should_perform_separate_reduction( + // epilogue_subtile, sk_units, sk_tiles, dp_tiles, ctas_per_wave); + + // Determine the number of stream-K groups that will be used. Choosing the + // fast moving dimension of the underlying grid. + uint32_t max_groups_problem = problem_blocks_n; + + // Select the number of groups that will be use. We start with the maximum + // number of potential groups, and iterate down looking for a group size that + // evenly divides the stream-K units and tiles, and for which the resulting + // number of K tiles per stream-K unit remains above min_iters_per_sk_unit_ + + uint32_t groups = platform::min(max_groups_problem, uint32_t(max_sk_groups_)); + + // Grouping is disabled when separate reduction is used + // if (do_separate_reduction) { + // groups = 1; + // } + + uint32_t fallback_groups = 0; + + auto sk_splits_too_small = [&](uint32_t g) { + // Check whether the number of K tiles computed per stream-K unit is less + // than min_iters_per_sk_unit_ + auto total_sk_tiles = sk_tiles / g; + auto total_sk_k_tiles = total_sk_tiles * k_tiles_per_output_tile; + auto k_tiles_per_sk_unit = total_sk_k_tiles / (sk_units / g); + return k_tiles_per_sk_unit < min_iters_per_sk_unit_; + }; + + auto is_ideal_grouping = [&](uint32_t g) { + // An ideal grouping will evenly divide stream-K clusters, evenly divide + // stream-K tiles, and not result in stream-K splits that are too small. + return (sk_units % g == 0) && (sk_tiles % g == 0) && !sk_splits_too_small(g); + }; + + auto is_valid_grouping = [&](uint32_t g) { + // A grouping is valid, but not ideal, if it evenly divides the + // stream-K clusters and does not result in stream-K splits that are + // too small. Such a setting can be used as a fallback option in the + // case that an ideal grouping is not achievable + return sk_units % g == 0 && !sk_splits_too_small(g); + }; + + while (groups > 1 && !is_ideal_grouping(groups)) { + if (fallback_groups == 0 && is_valid_grouping(groups)) { + // Set fallback groups once in preference for a larger number of groups. + fallback_groups = groups; + } + --groups; + } + + // If groups == 1, we did not find a group count that satisfies all criteria. If we have + // found a fallback group count, use this instead. + if (groups == 1 && fallback_groups > 0) { + groups = fallback_groups; + } + + auto sk_units_per_group = sk_units / groups; + + // sk_tiles is guaranteed to be divisible by cluster_size because it is calculated as: + // sk_tiles = (waves <= 2) ? total_tiles : (sm_count + (total_tiles % sm_count)) + // Both total_tiles and sm_count are multiples of cluster size due to padding added + // prior to kernel launch. 
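+    // Illustrative example (hypothetical numbers): with sk_tiles = 10 and groups = 4,
+    // sk_tiles_per_group = 2 and sk_big_groups = 2, i.e. two "big" groups cover three
+    // stream-K tiles each and the remaining two groups cover two each (2*3 + 2*2 = 10).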
+ uint64_t sk_tiles_per_group = sk_tiles / groups; + + // Groups that will process an extra stream-K tile cluster. These differ from "big_units," which + // are stream-K units within a group that process an extra K chunk. + uint64_t sk_big_groups = sk_tiles % groups; + + uint64_t k_tiles_per_group = k_tiles_per_output_tile * sk_tiles_per_group; + + // Number of k tiles computed per stream-K unit + uint64_t k_tiles_per_sk_unit = k_tiles_per_group / sk_units_per_group; + + uint32_t reduction_units = 0; + + // Use separate reduction when we have less than one wave of output tiles (dp_tiles == 0) + // and when each tile will be operated on by at least two stream-K units (sk_units > 2 * sk_tiles) + // if (do_separate_reduction) { + // // Each reduction unit will reduce the partials of an epilogue subtile for + // // a given output tile and compute the epilogue. Thus, there are as many reduction + // // units as there are epilogue subtiles. + // reduction_units = sk_tiles * epilogue_subtile; + // } + // else + if (decomposition_mode == DecompositionMode::Heuristic && sk_tiles < sk_units && sk_units % sk_tiles == 0) { + // If the number of stream-K units is a multiple of the number of stream-K tiles, then + // the problem can leverage a basic split-K decomposition for the stream-K tiles. + // This case happens when separate reduction is disable. + uint32_t sk_splits = static_cast(sk_units / sk_tiles); + set_params_basic( + problem_blocks_m, + problem_blocks_n, + problem_blocks_l, + sk_splits, + k_tiles_per_output_tile, + reduction_workspace, + reduction_mode + ); + return; + } + + divmod_batch_ = FastDivmodU64(problem_blocks_m * problem_blocks_n); + divmod_tiles_per_output_tile_ = FastDivmod(k_tiles_per_output_tile); + divmod_sk_groups_ = FastDivmodU64(static_cast(groups)); + divmod_sk_units_per_group_ = FastDivmodU64(static_cast(sk_units / groups)); + + divmod_splits_ = FastDivmod(1); + units_per_problem_ = static_cast(dp_units + sk_units); + + // Assign big_units_ assuming that group count == 1. This is unused by stream-K + // when group count > 1. + big_units_ = static_cast(k_tiles_per_group % k_tiles_per_sk_unit); + + big_groups_ = static_cast(sk_big_groups); + reduction_workspace_ = reduction_workspace; + sk_tiles_ = sk_tiles; + sk_units_ = static_cast(sk_units); + divmod_k_tiles_per_sk_unit_ = FastDivmod(static_cast(k_tiles_per_sk_unit)); + divmod_k_tiles_per_sk_big_unit_ = FastDivmod(static_cast(k_tiles_per_sk_unit + 1)); + reduction_mode_ = reduction_mode; + } + + // Get the number of CTA tiles in this problem. This variant of the method should only be used when + // problem_shape and tile_shape contain modes of only rank 1. + CUTLASS_HOST_DEVICE + static dim3 + get_tiled_cta_shape_mnl(BatchedGemmCoord problem_shape, GemmCoord cta_shape) { + auto cta_m = (problem_shape.m() + cta_shape.m() - 1) / cta_shape.m(); + auto cta_n = (problem_shape.n() + cta_shape.n() - 1) / cta_shape.n(); + + return { + static_cast(cta_m), + static_cast(cta_n), + static_cast(problem_shape.batch()) + }; + } + + CUTLASS_HOST_DEVICE + static dim3 + get_grid_shape( + dim3 problem_blocks, + KernelHardwareInfo hw_info + ) { + return dim3{problem_blocks.y, problem_blocks.x, 1}; + } + + // Returns the number of stream-K tiles that will be computed amongst `output_tiles` total + // output tiles on a device with `ctas_per_wave` CTAs in each wave. 
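+  // In short: data-parallel and split-K decompositions get 0 stream-K tiles; otherwise the
+  // partial tail wave plus one full wave is treated as stream-K work, unless the heuristic
+  // keeps everything data-parallel (no wave quantization, K extent too small, or a tail wave
+  // that is already more than half full).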
+ static uint32_t + get_num_sk_tiles( + uint64_t output_tiles, + uint64_t ctas_per_wave, + uint32_t k_tiles_per_output_tile, + DecompositionMode decomposition_mode + ) { + uint32_t full_waves = static_cast(output_tiles / ctas_per_wave); + uint32_t total_waves = static_cast((output_tiles + ctas_per_wave - 1) / ctas_per_wave); + + if (decomposition_mode == DecompositionMode::DataParallel || + decomposition_mode == DecompositionMode::SplitK) { + return 0; + } + + // If there is wave quantization, assign the first two waves worth of tiles to be + // covered by stream-K work and the remainder to be data-parallel. Since we know + // that full_waves == total_waves - 1 in this case, the number of data-parallel + // waves is simply full_waves-1 (unless full_waves == 0). + uint32_t dp_waves = full_waves > 1 ? full_waves - 1 : 0; + uint64_t dp_tiles = dp_waves * ctas_per_wave; + uint64_t sk_tiles = output_tiles - dp_tiles; + + if (decomposition_mode == DecompositionMode::Heuristic) { + if (full_waves == total_waves || k_tiles_per_output_tile <= min_iters_per_sk_unit_) { + // All tiles will be data-parallel tiles if there is either no quantization + // or if there is no work to be split. + return 0; + } + + // + // The final wave is not full. Perform some stream-K work. + // + + // Rudimentary heuristic: prefer data-parallel decomposition if we have more than + // one wave and the tail wave is more than half full. This is subject to change. + uint64_t tail_tiles = output_tiles - (full_waves * ctas_per_wave); + if (2 * tail_tiles >= ctas_per_wave) { + return 0; + } + } + + return static_cast(sk_tiles); + } + + CUTLASS_HOST_DEVICE + static uint64_t + get_num_sk_units(uint64_t ctas_per_sk_wave, uint32_t sk_tiles, uint32_t k_tiles_per_output_tile) { + // If there are stream-K tiles to compute and a sufficiently large number of k iterations + // across them, they will be covered by a single wave of persistent threadblocks. Thus, there + // will be as many work units as there are threadblocks in a single wave. + // + // When the total k iterations across stream-K tiles is too small to justify distributing + // across an entire wave of blocks, we instead distribute the iterations over a smaller + // set of blocks. + + // Calculate the number of stream-K units that would be needed if each stream-K unit + // computed the minimum allowable k iterations. Truncate this to be in units of clusters. + + // Number of k iterations computed by the stream-K units as a whole + uint64_t k_tiles_sk_total = k_tiles_per_output_tile * sk_tiles; + + // Calculate the number of stream-K units that would be needed if each stream-K unit + // computed the minimum allowable k iterations. Truncate this to be in units of clusters. 
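+    // For example (hypothetical numbers): 4 stream-K tiles of 16 K tiles each give
+    // k_tiles_sk_total = 64, so at most 64 / min_iters_per_sk_unit_ = 64 / 8 = 8 stream-K
+    // units are used, even if the wave has more CTAs available.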
+ uint64_t min_sized_sk_units = (k_tiles_sk_total / min_iters_per_sk_unit_); + + uint64_t sk_units = platform::min(ctas_per_sk_wave, min_sized_sk_units); + return sk_units; + } + + // Calculates the size of the workspace needed for holding reduction barriers + CUTLASS_HOST_DEVICE + static size_t + get_barrier_workspace_size(uint64_t num_tiles, uint32_t barrier_bits) { + size_t workspace_bits = num_tiles * static_cast(barrier_bits); + return round_up_to_l2_alignment(bits_to_bytes(workspace_bits)); + } + + // Calculates the size of the workspace needed for holding partial outputs from splits + CUTLASS_HOST_DEVICE + static size_t + get_reduction_workspace_size(uint64_t num_tiles, GemmCoord tile_shape, uint32_t accumulator_bits, uint32_t num_accumulator_mtxs = 1) { + size_t output_tile_size = tile_shape.m() * tile_shape.n(); + size_t workspace_bits = accumulator_bits * output_tile_size * num_tiles * num_accumulator_mtxs; + return round_up_to_l2_alignment(bits_to_bytes(workspace_bits)); + } + + static void + get_workspace_component_sizes( + dim3 problem_blocks, + uint32_t k_tiles_per_output_tile, + GemmCoord tile_shape, + size_t& barrier_workspace_size, + size_t& reduction_workspace_size, + KernelHardwareInfo const& hw_info, + int splits, + DecompositionMode decomposition_mode, + uint32_t barrier_bits, + uint32_t accumulator_bits) { + + // Workspace is needed only for output tiles that will be split. Thus, we first determine the number + // of output tiles that will be split, and then calculate the workspace needed to cover these. + uint64_t output_tiles = problem_blocks.x * problem_blocks.y * problem_blocks.z; + + if (decomposition_mode == DecompositionMode::DataParallel) { + barrier_workspace_size = 0; + reduction_workspace_size = 0; + } + else if (splits > 1 && + (decomposition_mode == DecompositionMode::SplitK || decomposition_mode == DecompositionMode::Heuristic)) { + // Basic split-K variant requires workspace for all output tiles + barrier_workspace_size = get_barrier_workspace_size(output_tiles, barrier_bits); + reduction_workspace_size = get_reduction_workspace_size(output_tiles, tile_shape, accumulator_bits); + } + else { + KernelHardwareInfo new_hw_info; + new_hw_info.device_id = hw_info.device_id; + new_hw_info.sm_count = hw_info.sm_count; + if (new_hw_info.sm_count <= 0) { + CUTLASS_TRACE_HOST(" WARNING: Arguments do not include a valid SM count.\n" + " For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count."); + new_hw_info.sm_count = KernelHardwareInfo::query_device_multiprocessor_count(new_hw_info.device_id); + } + + dim3 grid = get_grid_shape( + problem_blocks, + new_hw_info + ); + uint64_t ctas_per_wave = grid.x * grid.y; + uint32_t sk_tiles = get_num_sk_tiles( + output_tiles, + ctas_per_wave, + static_cast(k_tiles_per_output_tile), + decomposition_mode + ); + uint64_t ctas_per_sk_wave = ctas_per_wave; + uint64_t sk_units = get_num_sk_units(ctas_per_sk_wave, sk_tiles, k_tiles_per_output_tile); + uint64_t dp_tiles = output_tiles - sk_tiles; + + uint64_t reduction_tiles = sk_tiles; + + // Though separate reduction requires a larger reduction workspace, only one barrier + // is needed per output tile. Each peer will increment the barrier by one once the peer has + // written its accumulator to scratch space. The separate reduction unit will only begin + // performing the reduction when the barrier has reached the number of peers for the output tile. 
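+      // Sizing sketch (as implemented by the helpers above): one barrier per stream-K tile,
+      // plus one accumulator-sized scratch tile per reduction tile (reduction_tiles == sk_tiles
+      // here, since separate reduction is disabled in this WIP), each rounded up to the L2
+      // cache-line alignment.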
+ barrier_workspace_size = get_barrier_workspace_size(sk_tiles, barrier_bits); + reduction_workspace_size = get_reduction_workspace_size(reduction_tiles, tile_shape, accumulator_bits); + } + } + + // Get the amount of scratch workspace needed for the kernel. This variant of the method should only be used when + // problem_shape and tile_shape contain modes of only rank 1. + static size_t + get_workspace_size( + BatchedGemmCoord problem_shape, + GemmCoord tile_shape, + KernelHardwareInfo const& hw_info, + int splits, + DecompositionMode decomposition_mode, + uint32_t barrier_bits, + uint32_t element_accumulator_bits) { + + dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, tile_shape); + uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k(); + + return get_workspace_size( + problem_blocks, + k_tiles_per_output_tile, + tile_shape, + hw_info, + splits, + decomposition_mode, + barrier_bits, + element_accumulator_bits + ); + } + + // Version of get_workspace_size that takes in as input the number of CTAs in the M and N dimensions. + // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1, + // for which using CuTe algebra for calculating tile shapes is easiest. + static size_t + get_workspace_size( + dim3 problem_blocks, + uint32_t k_tiles_per_output_tile, + GemmCoord tile_shape, + KernelHardwareInfo const& hw_info, + int splits, + DecompositionMode decomposition_mode, + uint32_t barrier_bits, + uint32_t element_accumulator_bits) { + + size_t barrier_workspace_size = 0; + size_t reduction_workspace_size = 0; + + get_workspace_component_sizes( + problem_blocks, + k_tiles_per_output_tile, + tile_shape, + barrier_workspace_size, + reduction_workspace_size, + hw_info, + splits, + decomposition_mode, + barrier_bits, + element_accumulator_bits + ); + + return barrier_workspace_size + reduction_workspace_size; + } + + // Initialize the workspace to be used for the kernel. This variant of the method should only be used when + // problem_shape and tile_shape contain modes of only rank 1. + static cutlass::Status + initialize_workspace( + void* workspace, + BatchedGemmCoord problem_shape, + GemmCoord tile_shape, + KernelHardwareInfo const& hw_info, + int splits, + DecompositionMode decomposition_mode, + uint32_t barrier_bits, + uint32_t element_accumulator_bits) { + + dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, tile_shape); + uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k(); + + return initialize_workspace( + workspace, + problem_blocks, + k_tiles_per_output_tile, + tile_shape, + hw_info, + splits, + decomposition_mode, + barrier_bits, + element_accumulator_bits + ); + } + + // Version of initialize_workspace that takes in as input the number of CTAs in the M and N dimensions. + // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1, + // for which using CuTe algebra for calculating tile shapes is easiest. 
+ static cutlass::Status + initialize_workspace( + void* workspace, + dim3 problem_blocks, + uint32_t k_tiles_per_output_tile, + GemmCoord tile_shape, + KernelHardwareInfo const& hw_info, + int splits, + DecompositionMode decomposition_mode, + uint32_t barrier_bits, + uint32_t element_accumulator_bits) { + + uint64_t barrier_workspace_size = 0; + uint64_t reduction_workspace_size = 0; + + get_workspace_component_sizes( + problem_blocks, + k_tiles_per_output_tile, + tile_shape, + barrier_workspace_size, + reduction_workspace_size, + hw_info, + splits, + decomposition_mode, + barrier_bits, + element_accumulator_bits + ); + + if (barrier_workspace_size > 0) { + if (workspace == nullptr) { + return Status::kErrorWorkspaceNull; + } + + // Only the barrier workspace needs to be cleared for stream-K. + // Barrier workspace follows reduction workspace. + uint8_t* barrier_workspace = reinterpret_cast(workspace) + reduction_workspace_size; + return zero_workspace(static_cast(barrier_workspace), barrier_workspace_size); + } + + return Status::kSuccess; + } + + void + set_params_basic( + uint32_t blocks_m, + uint32_t blocks_n, + uint32_t blocks_l, + uint32_t splits, + uint32_t k_tiles_per_output_tile, + void* reduction_workspace, + ReductionMode reduction_mode) { + + divmod_batch_ = FastDivmodU64(blocks_m * blocks_n); + divmod_tiles_per_output_tile_ = FastDivmod(k_tiles_per_output_tile); + divmod_sk_groups_ = FastDivmodU64(1u); + divmod_splits_ = FastDivmod(splits); + units_per_problem_ = blocks_m * blocks_n * blocks_l; + big_units_ = k_tiles_per_output_tile % splits; + reduction_workspace_ = reduction_workspace; + reduction_mode_ = reduction_mode; + divmod_k_tiles_per_sk_unit_ = FastDivmod(k_tiles_per_output_tile / splits); + divmod_k_tiles_per_sk_big_unit_ = FastDivmod(k_tiles_per_output_tile / splits + 1); + + // No stream-K work is performed for "basic" data-parallel and split-K decompositions + sk_tiles_ = 0; + sk_units_ = 0; + divmod_sk_units_per_group_ = FastDivmodU64(1u); + } + + private: + // Round up number of bytes to the nearest multiple of L2 cache line alignment + CUTLASS_HOST_DEVICE + static size_t + round_up_to_l2_alignment(size_t bytes) { + constexpr size_t L2CacheLineSizeBytes = 128u; + return (bytes + L2CacheLineSizeBytes - 1) / L2CacheLineSizeBytes * L2CacheLineSizeBytes; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +} // namespace detail +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp new file mode 100644 index 0000000000..b32c65296e --- /dev/null +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -0,0 +1,857 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/barrier.h" +#include "cutlass/block_striped.h" +#include "cutlass/fast_math.h" +#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp" +#include "cutlass/kernel_hardware_info.hpp" +#include "cute/layout.hpp" +#include "cute/tensor.hpp" +#include "cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp" + +namespace cutlass::gemm::kernel::detail { + +// Persistent Thread Block (TB) scheduler leveraging stream-K decomposition +template < + class TileShape +> +class PersistentTileSchedulerIntelPVCStreamK { + // + // Data members + // + +private: + uint64_t current_work_linear_idx_ = 0; + +public: + + // Use a dummy barrier manager to simply get the type used to store the barrier + using BarrierType = typename NamedBarrierManager<1>::T; + + using Params = PersistentTileSchedulerIntelPVCStreamKParams; + using ReductionMode = Params::ReductionMode; + using DecompositionMode = Params::DecompositionMode; + + struct WorkTileInfo { + int32_t M_idx = 0; + int32_t N_idx = 0; + int32_t K_idx = 0; + int32_t L_idx = 0; + + // Number of k tiles to compute for this unit of work. For stream-K, this + // can indicate the number of K tiles across multiple output tiles. + uint32_t k_tile_count = 0; + + // Number of k tiles remaining for the work unit as a whole + uint32_t k_tile_remaining = 0; + + // Whether this unit of work is the final split for the given tile + bool is_separate_reduction = false; + + CUTLASS_HOST_DEVICE + bool + is_valid() const { + // A work tile that computes no K tiles is invalid unless it is a separate-reduction work tile + // (which only performs reduction and epilogue) + return k_tile_count > 0 || is_separate_reduction; + } + + CUTLASS_HOST_DEVICE + bool + is_reduction_unit() const { + return is_separate_reduction; + } + + CUTLASS_HOST_DEVICE + int32_t + reduction_subtile_idx() const { + // For separate reduction units, the K_idx of the work tile is unused. + // Therefore, we override it to contain the subtile of that the reduction + // unit operates on. + return is_reduction_unit() ? 
K_idx : -1; + } + + CUTLASS_HOST_DEVICE + static WorkTileInfo + invalid_work_tile() { + return {-1, -1, -1, -1, 0}; + } + + CUTLASS_HOST_DEVICE + bool + is_final_split(uint32_t k_tiles_per_output_tile) const { + return (K_idx + k_tile_count) == k_tiles_per_output_tile; + } + }; + + struct Arguments { + + Arguments() = default; + Arguments(Arguments const&) = default; + Arguments(Arguments&&) = default; + + CUTLASS_HOST_DEVICE + Arguments& + operator=(Arguments const& args) { + splits = args.splits; + reduction_mode = args.reduction_mode; + decomposition_mode = args.decomposition_mode; + return *this; + } + + CUTLASS_HOST_DEVICE + Arguments& + operator=(Arguments&& args) noexcept { + splits = args.splits; + reduction_mode = args.reduction_mode; + decomposition_mode = args.decomposition_mode; + return *this; + } + + CUTLASS_HOST_DEVICE + Arguments(int splits_) : splits(splits_) {} + + CUTLASS_HOST_DEVICE + Arguments(int splits_, DecompositionMode decomposition_mode_) : + splits(splits_), + decomposition_mode(decomposition_mode_) {} + + // The splitting factor to be used in a split-K decomposition of the problem. + // If this is set to a value greater than 1, stream-K decomposition logic + // is bypassed in favor of a split-K decomposition. + int splits = 1; + ReductionMode reduction_mode = ReductionMode::Deterministic; + DecompositionMode decomposition_mode = DecompositionMode::Heuristic; + }; + + // Sink scheduler params as a member + Params scheduler_params; + + // + // Methods + // + + template + static Params + to_underlying_arguments( + ProblemShape problem_shape, + TileShape tile_shape, + KernelHardwareInfo const& hw_info, + Arguments const& args, + void* workspace) { + + static_assert(cute::is_static::value); + + auto problem_shape_mnkl = cute::append<4>(problem_shape, cute::Int<1>{}); + dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + uint32_t k_tile_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(problem_shape_mnkl), cute::shape<2>(TileShape{}))); + + Params params; + params.initialize( + problem_blocks, + k_tile_per_output_tile, + hw_info, + args.splits, + args.reduction_mode, + args.decomposition_mode, + workspace + ); + return params; + } + + static bool + can_implement(Arguments const& args) { + // Split count > 1 is only valid for heuristic and split-K decomposition modes + return (args.splits == 1 || + args.decomposition_mode == DecompositionMode::Heuristic || + args.decomposition_mode == DecompositionMode::SplitK); + } + + CUTLASS_HOST_DEVICE + PersistentTileSchedulerIntelPVCStreamK() { }; + + CUTLASS_HOST_DEVICE + PersistentTileSchedulerIntelPVCStreamK(Params const& params_) : scheduler_params(params_) { + current_work_linear_idx_ = uint64_t(BlockIdxX()) + uint64_t(BlockIdxY()) * uint64_t(GridDimX()); + } + + CUTLASS_DEVICE + WorkTileInfo + get_current_work() const { + return get_current_work_for_linear_idx(current_work_linear_idx_, scheduler_params); + } + + CUTLASS_DEVICE + static WorkTileInfo + get_current_work_for_linear_idx(uint64_t linear_idx, Params const& params) { + // The maximum number of work units is units_per_problem_ * splits_. + // The multiplication by splits_ is used for handling split-K, in which + // units_per_problem_ is equal to the total number of output tiles. To account + // for the fact that we have splits_ peers per output tile, we multiply this + // value by splits_. For stream-K, this multiplication ends up being a no-op + // because splits_ is set to 1 for stream-K. 
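      // Editor's note (illustrative numbers, not part of the patch): with
      // units_per_problem_ = 8 output tiles and a split-K factor splits_ = 4, the
      // scheduler exposes 8 * 4 = 32 linear work units, so any linear_idx >= 32 takes
      // the invalid-work-tile path below. For stream-K, splits_ == 1 and the bound is
      // simply units_per_problem_.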
+ if(linear_idx >= (params.units_per_problem_ * params.divmod_splits_.divisor)) { + // Invalid work. Return an empty result. + return WorkTileInfo::invalid_work_tile(); + } + + WorkTileInfo work_tile_info; + assign_work(params, linear_idx, work_tile_info); + return work_tile_info; + } + + // Returns whether the current work_tile_info passed in should continue to be used. This + // occurs only in the stream-K decomposition with stream-K work units, which encompass + // work over multiple output tiles. If the current work_tile_info should continue to be + // used, it is updated to advance to the next output tile it should cover. + CUTLASS_DEVICE + bool + continue_current_work(WorkTileInfo& work_tile_info) const { + return continue_current_work_for_linear_idx( + current_work_linear_idx_, work_tile_info, scheduler_params); + } + + CUTLASS_DEVICE + static bool + continue_current_work_for_linear_idx( + uint64_t linear_idx, + WorkTileInfo& work_tile_info, + Params const& params) { + + work_tile_info.k_tile_remaining -= work_tile_info.k_tile_count; + + if (work_tile_info.k_tile_remaining == 0) { + return false; + } + assign_work(params, linear_idx, work_tile_info); + return work_tile_info.is_valid(); + } + + CUTLASS_DEVICE + void + advance_to_next_work(uint32_t advance_count = 1) { + current_work_linear_idx_ += uint64_t(GridDimX()) * uint64_t(GridDimY()) * uint64_t(GridDimZ()) * uint64_t(advance_count); + } + + // Given the inputs, computes the total number of output blocks this problem will compute over + // Note that this is only the logical size of our grid, not the physical grid we will actually launch. + template + CUTLASS_HOST_DEVICE static + dim3 + get_tiled_cta_shape_mnl(ProblemShape problem_shape_mnkl, TileShape cta_shape) { + return Params::get_tiled_cta_shape_mnl(to_gemm_coord(problem_shape_mnkl), to_gemm_coord(cta_shape)); + } + + // Given the cluster shape, computes the physical grid we should launch. + template + CUTLASS_HOST_DEVICE static + dim3 + get_grid_shape( + ProblemShape problem_shape, + TileShape tile_shape, + KernelHardwareInfo hw_info) { + + auto problem_shape_mnkl = cute::append<4>(problem_shape, cute::Int<1>{}); + dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + + return Params::get_grid_shape( + problem_blocks, + hw_info + ); + } + + // Returns whether fixup is needed for `work_tile_info`. + CUTLASS_HOST_DEVICE + static bool + requires_fixup(Params const& params, WorkTileInfo const& work_tile_info) { + // Fixup is not needed for invalid or data-parallel tiles + return work_tile_info.is_valid() && work_tile_info.k_tile_count != params.divmod_tiles_per_output_tile_.divisor; + } + + // Performs the reduction across splits for a given output tile. +/*template + CUTLASS_DEVICE + static void + fixup( + Params const& params, + WorkTileInfo const& work_tile_info, + FrgTensorC& accumulators, + uint32_t num_barriers, + uint32_t barrier_idx) { + static constexpr uint32_t Offset = static_cast(cutlass::arch::ReservedNamedBarriers::StreamkBarrier0); + static constexpr uint32_t MaxNumNamedBarriers = 2; + using BarrierManager = NamedBarrierManager; + return fixup_helper( + params, work_tile_info, accumulators, num_barriers, barrier_idx); + } + + // Helper for performing the reduction across splits for a given output tile. 
+ template + CUTLASS_DEVICE + static void + fixup_helper( + Params const& params, + WorkTileInfo const& work_tile_info, + FrgTensorC& accumulators, + uint32_t num_barriers, + uint32_t barrier_idx, + uint32_t num_accumulator_mtxs = 1) { + + using ElementAccumulator = typename FrgTensorC::value_type; + + if (!requires_fixup(params, work_tile_info)) { + return; + } + auto tile_idx = output_tile_index(params, work_tile_info); + + // Index of the lock on which to wait + auto lock_idx = (tile_idx * num_barriers) + barrier_idx; + + auto reduction_tile_idx = tile_idx; + auto [first_peer_id, my_peer_id, last_peer_id] = tile_peer_range(params, tile_idx, static_cast(work_tile_info.K_idx)); + auto reduction_peer_offset = 0; + if (params.requires_separate_reduction()) { + // If separate reduction is to be performed, each stream-K unit writes its partials + // to a separate portion of the workspace. There are as many of these portions as there + // are peers for a given output tile, so we multiply the tile index by the maximum peer count. + reduction_tile_idx *= Params::max_peers_per_tile(params.sk_units_, params.sk_tiles_); + reduction_peer_offset = my_peer_id * cute::size<0>(TileShape{}) * cute::size<1>(TileShape{}); + } + + // Reductions use BlockStripedReduce with a width of BarrierManager::ThreadCount under the hood. + // Thus, the start of the reduction space is the same across all threads in a warp group. + int reduction_offset = + (cute::size<0>(TileShape{}) * cute::size<1>(TileShape{}) * reduction_tile_idx * num_accumulator_mtxs) + + reduction_peer_offset + + (size(accumulators) * barrier_idx * BarrierManager::ThreadCount); + + ElementAccumulator* group_reduction_workspace = reinterpret_cast(params.reduction_workspace_) + reduction_offset; + + using AccumulatorArrayT = Array; + using BlockStripedReduceT = BlockStripedReduce; + + AccumulatorArrayT* reduction_workspace_array = reinterpret_cast(group_reduction_workspace); + AccumulatorArrayT* accumulator_array = reinterpret_cast(accumulators.data()); + + int barrier_group_thread_idx = ThreadIdxX() % BarrierManager::ThreadCount; + + // The number of tiles for which reduction is required is either: + // (a) the total number of output tiles (in the case of split-K) + // (b) the number of stream-K tiles (potentially multiplied by peer count if using separate reduction) + // To calculate the total number of output tiles in the split-K case, we + // note that, in the split-K case, the units_per_problem_ member of Params will be + // the total number of output tiles. + uint32_t reduction_tiles = 0; + if (params.divmod_splits_.divisor > 1) { + reduction_tiles = params.units_per_problem_; + } + else if (params.requires_separate_reduction()) { + reduction_tiles = params.sk_tiles_ * Params::max_peers_per_tile(params.sk_units_, params.sk_tiles_); + } + else { + reduction_tiles = params.sk_tiles_; + } + + auto reduction_workspace_size = Params::get_reduction_workspace_size( + reduction_tiles, to_gemm_coord(TileShape{}), sizeof_bits::value, num_accumulator_mtxs); + BarrierType* lock_workspace = reinterpret_cast( + reinterpret_cast(params.reduction_workspace_) + reduction_workspace_size); + + if (work_tile_info.is_reduction_unit()) { + plus add_fragments; + auto peer_offset = size(accumulators) * num_barriers * BarrierManager::ThreadCount; + + // Wait until the peers collaborating on this output tile have all written + // their accumulators to workspace. 
+ uint32_t num_peers = last_peer_id - first_peer_id + 1; + BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, num_peers); + + // Load the first peer's data + BlockStripedReduceT::load(*accumulator_array, reduction_workspace_array, barrier_group_thread_idx); + + for (int i = 1; i < num_peers; ++i) { + // Load peer fragment + AccumulatorArrayT addend_fragment; + auto peer_reduction_workspace = reinterpret_cast(group_reduction_workspace + (i * peer_offset)); + + BlockStripedReduceT::load(addend_fragment, peer_reduction_workspace, barrier_group_thread_idx); + + // Add peer fragment + *accumulator_array = add_fragments(*accumulator_array, addend_fragment); + } + } + else if (!compute_epilogue(work_tile_info, params)) { + if (params.requires_separate_reduction() || work_tile_info.K_idx == 0) { + // The first peer initializes the workspace partials in the non-separate-reduction case, + // and all peers write to their own location in workspace when using separate reduction + BlockStripedReduceT::store(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx); + } + else { + // Wait until the preceding split added its accumulators + BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx); + + // Perform reduction in workspace + BlockStripedReduceT::reduce(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx); + } + + // If separate reduction is being performed, each participating stream-K unit increments the barrier + // by only 1. Otherwise, increment by the K tile count that this unit has processed. + int32_t increment = params.requires_separate_reduction() ? 1 : work_tile_info.k_tile_count; + + // Signal our arrival + BarrierManager::arrive_inc(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, increment); + } + else { + if (params.reduction_mode_ == ReductionMode::Deterministic) { + // Wait until the preceding split added its accumulators + BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx); + } + else { + // Wait unitl the first split has stored its accumulators + BarrierManager::wait_lt(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, 1); + } + + // The block computing the final split for the tile adds previously-reduced partials + // to its accumulators and computes the epilogue. + BlockStripedReduceT::load_add(*accumulator_array, reduction_workspace_array, barrier_group_thread_idx); + } + } + + // Returns whether the block assigned this work should compute the epilogue for the corresponding + // output tile. For the case of stream-K, this should only occur if the work is marked as the final split. + CUTLASS_HOST_DEVICE + static bool + compute_epilogue(WorkTileInfo const& work_tile_info, Params const& params) { + // `is_final_split` will be set to `true` for the following scenarios, all of which must compute the epilogue: + // 1. The tile is computed in data-parallel mode + // 2. The tile is computed in split-/stream-K mode and this work unit represents the final split of the tile + // 3. 
The tile is computed in split-/stream-K mode and separate reduction is used, and this is a separate reduction unit + return work_tile_info.is_valid() && + (work_tile_info.is_final_split(params.divmod_tiles_per_output_tile_.divisor) && + !params.requires_separate_reduction()) || work_tile_info.is_separate_reduction; + } + + // Returns the linearized index of the output tile corresponding to the tile with offset [L, M, K] + CUTLASS_DEVICE + static int + output_tile_index(Params const& params, WorkTileInfo const& work_tile_info) { + uint64_t linear_idx_in_batch = UnderlyingScheduler::get_linear_idx_from_m_and_n( + work_tile_info.M_idx, work_tile_info.N_idx, + params.divmod_cluster_shape_major_, + params.divmod_cluster_shape_minor_, + params.divmod_cluster_blk_major_, + params.log_swizzle_size_, + params.raster_order_ + ); + + uint64_t tiles_mn = params.divmod_batch_.divisor; + return tiles_mn * work_tile_info.L_idx + linear_idx_in_batch; + }*/ + + template + static size_t + get_workspace_size( + Arguments const& args, + ProblemShape problem_shape, + KernelHardwareInfo const& hw_info) { + + auto problem_shape_mnkl = cute::append<4>(problem_shape, 1); + + TileShape tile_shape; + + dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + uint32_t k_tile_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(problem_shape_mnkl), cute::shape<2>(TileShape{}))); + + return Params::get_workspace_size( + problem_blocks, + k_tile_per_output_tile, + to_gemm_coord(tile_shape), + hw_info, + args.splits, + args.decomposition_mode, + sizeof_bits::value, + sizeof_bits::value + ); + } + + template + static cutlass::Status + initialize_workspace( + Arguments const& args, + void* workspace, + ProblemShape const& problem_shape, + KernelHardwareInfo const& hw_info) { + + auto problem_shape_mnkl = cute::append<4>(problem_shape, 1); + + TileShape tile_shape; + + dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + uint32_t k_tile_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(problem_shape_mnkl), cute::shape<2>(TileShape{}))); + + return Params::initialize_workspace( + workspace, + problem_blocks, + k_tile_per_output_tile, + to_gemm_coord(tile_shape), + hw_info, + args.splits, + args.decomposition_mode, + sizeof_bits::value, + sizeof_bits::value + ); + } + + template + CUTLASS_HOST_DEVICE + static int + get_work_k_tile_count(WorkTileInfo const& work_tile_info, ProblemShape, TileShape) { + return work_tile_info.k_tile_count; + } + + CUTLASS_HOST_DEVICE + static uint32_t + get_work_k_tile_start(WorkTileInfo const& work_tile_info) { + return work_tile_info.K_idx; + } + + // Kernel helper function to get next work tile + CUTLASS_DEVICE + auto + fetch_next_work(WorkTileInfo work_tile_info) { + if (continue_current_work(work_tile_info)) { + return work_tile_info; + } + + advance_to_next_work(); + return get_current_work(); + } + + // Returns the initial work tile info that will be computed over + CUTLASS_DEVICE + WorkTileInfo + initial_work_tile_info() { + return get_current_work(); + } + +private: + // Sets the current stream-K work to compute within work_tile_info. If new_unit is true, work_tile_info + // is populated as a new unit of work. Otherwise, state existing in work_tile_info (e.g., remaining + // iterations) is used to find the next tile in the current work unit. 
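  // Editor's sketch (assumed usage, mirroring the persistent-loop pattern of the
  // stream-K GEMM kernel in this patch; `params`, the mainloop and the epilogue
  // calls are placeholders rather than identifiers from this file):
  //
  //   TileScheduler scheduler{params.scheduler};
  //   auto work_tile_info = scheduler.initial_work_tile_info();
  //   while (work_tile_info.is_valid()) {
  //     // mainloop over work_tile_info.k_tile_count K tiles, then fixup(...)
  //     // and, if compute_epilogue(...), the epilogue for this output tile
  //     work_tile_info = scheduler.fetch_next_work(work_tile_info);
  //   }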
+/*CUTLASS_DEVICE + static void + assign_work( + Params const& params, + uint64_t linear_idx, + WorkTileInfo& work_tile_info) { + + auto [cta_m_in_cluster_, cta_n_in_cluster_, _] = cute::block_id_in_cluster(); + uint64_t cta_m_in_cluster = static_cast(cta_m_in_cluster_); + uint64_t cta_n_in_cluster = static_cast(cta_n_in_cluster_); + uint64_t output_tile_id = linear_idx; + if (linear_idx >= params.units_per_problem_ * params.divmod_splits_.divisor) { + // Separate-reduction work + auto cluster_size = params.get_cluster_size(); + // Divide up the linearized separate reduction units into clusters + auto cluster_linear_reduction_unit_idx = params.div_cluster_size((linear_idx - params.units_per_problem_)); + uint64_t cluster_tile_idx, epi_subtile_idx; + params.divmod_epilogue_subtile_(cluster_tile_idx, epi_subtile_idx, cluster_linear_reduction_unit_idx); + // Bring the linearized tile ID back into the space of tiles, rather than clusters + output_tile_id = cluster_tile_idx * cluster_size; + + work_tile_info.setup_separate_reduction(epi_subtile_idx); + } + else if (linear_idx >= params.sk_units_ && params.divmod_splits_.divisor == 1) { + // Data-parallel work + output_tile_id = linear_idx - params.sk_units_ + params.sk_tiles_; + work_tile_info.K_idx = 0; + work_tile_info.k_tile_count = params.divmod_tiles_per_output_tile_.divisor; + work_tile_info.k_tile_remaining = params.divmod_tiles_per_output_tile_.divisor; + } + else { + // In the CUTLASS 2.x implementation of stream K, stream-K work is assigned to each stream-K + // threadblock individually. For the most part, the set of K iterations corresponding to stream-K + // work was divided amongst stream-K threadblocks, and a threadblock determined which tile + // it would compute a (potentially-partial) output tile for based on the space of k iterations + // assigned to it. This often results in stream-K threadblocks processing tiles with different + // offsets in the K dimension from one another. This can reduce locality, but is lmitied to the + // (generally few) waves of threadblocks assigned to compute stream-K work. + // + // With the introduction of threadblock clusters, there is additional benefit to maintaining + // locality in the K dimension: shared portions of operands can be multicasted to threadblocks + // within a cluster. Thus, we would like to ensure that the assignment of stream-K work to + // threadblocks respects the ability to perform multicasting. + // + // To do so, we divide up the linearized stream-K units into clusters and share the same K + // offsets for work within clusters. + + auto cluster_linear_work_idx = params.div_cluster_size(linear_idx); + + uint64_t group_idx; + params.divmod_sk_groups_(cluster_linear_work_idx, group_idx, cluster_linear_work_idx); + + // Determine whether we are in a "big group" that will process an additional + // stream-K cluster tile. + auto sk_cluster_tiles = params.div_cluster_size(params.sk_tiles_); + auto sk_cluster_tiles_in_group = params.divmod_sk_groups_.divide(sk_cluster_tiles); + if (group_idx < params.big_groups_) { + ++sk_cluster_tiles_in_group; + } + + // Determine whether we are in a "big unit" within the group, that will process + // an additional K chunk in the group. 
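      // Editor's note (illustrative numbers, not part of the patch): with
      // k_tiles_in_group = 50 and 8 stream-K units per group,
      // k_tiles_per_unit_in_group = 50 / 8 = 6 and big_units_in_group = 50 - 8 * 6 = 2,
      // so the first two units cover 7 K tiles each and the remaining six cover 6 each
      // (2 * 7 + 6 * 6 = 50).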
+ auto sk_tiles_in_group = sk_cluster_tiles_in_group * params.get_cluster_size(); + auto k_tiles_in_group = sk_tiles_in_group * params.divmod_tiles_per_output_tile_.divisor; + auto k_tiles_per_unit_in_group = params.divmod_sk_units_per_group_.divide(k_tiles_in_group); + auto big_units_in_group = params.div_cluster_size( + k_tiles_in_group - (k_tiles_per_unit_in_group * params.divmod_sk_units_per_group_.divisor)); + + uint64_t split; + params.divmod_clusters_mnl_(split, cluster_linear_work_idx, cluster_linear_work_idx); + + bool is_split_k = params.divmod_splits_.divisor > 1; + auto big_unit_cmp_lhs = is_split_k ? split : cluster_linear_work_idx; + auto big_unit_cmp_rhs = is_split_k ? params.big_units_ : big_units_in_group; + auto linear_idx_mult = is_split_k ? params.divmod_tiles_per_output_tile_.divisor : k_tiles_per_unit_in_group; + auto k_tiles_per_split = is_split_k ? params.divmod_k_tiles_per_sk_unit_.divisor : k_tiles_per_unit_in_group; + + // Determine the starting k iteration computed by this stream-K work unit + uint32_t unit_iter_start = (linear_idx_mult * cluster_linear_work_idx) + + (k_tiles_per_split * split); + + // Adjust the starting position and number of k iterations for "big units," which + // compute one extra iteration. If there are any big units, they will be the first + // in the linearized ID space. + auto k_tiles_in_my_split = k_tiles_per_split; + if (big_unit_cmp_lhs < big_unit_cmp_rhs) { + // Since the "big units" are the first units in the linearized ID space, each + // of the units preceding this big unit computed one extra iteration. Thus, + // we must offset our start iteration by the number of units that precede + // the current unit in the linearized ID space. + unit_iter_start += big_unit_cmp_lhs; + ++k_tiles_in_my_split; + } + else { + // Increment by one for each of the big clusters (since all big units precede this unit) + unit_iter_start += big_unit_cmp_rhs; + } + + if (!is_split_k) { + // Adjust the unit starting position and number of tiles to avoid + // computing splits of size less than min_iters_per_sk_unit_ + int unused, start_tile_k_tile; + params.divmod_tiles_per_output_tile_(unused, start_tile_k_tile, unit_iter_start); + if (start_tile_k_tile < Params::min_iters_per_sk_unit_) { + // Starting K tile is in range [0, Params::min_iters_per_sk_unit_), which means that another + // stream-K unit will be computing a split with fewer than Params::min_iters_per_sk_unit_ K tiles. + // Adjust our work to take over these K tiles. + unit_iter_start -= start_tile_k_tile; + k_tiles_in_my_split += start_tile_k_tile; + } + else if (start_tile_k_tile > (params.divmod_tiles_per_output_tile_.divisor - Params::min_iters_per_sk_unit_)) { + // Starting K tile is within the final Params::min_iters_per_sk_unit_ K tiles of some output tile, + // which means that this unit will compute a split with fewer than Params::min_iters_per_sk_unit_ K tiles. + // Adjust our work to shed these K tiles to a neighboring stream-K unit that will compute more consecutive K tiles. 
+ auto adjustment_tiles = (params.divmod_tiles_per_output_tile_.divisor - start_tile_k_tile); + unit_iter_start += adjustment_tiles; + k_tiles_in_my_split -= adjustment_tiles; + } + else if (params.ktile_start_alignment_count == 2 && start_tile_k_tile % 2 != 0) { + // ktile for each SM start from even number + // If start from odd number ktile within the output tile + // now start at the ktile one before my initial ktile start (take one ktile from prev sm) + // if end on odd number ktile within the output tile + // now end at ktile that one before my ktile end (give one ktile to next sm) + unit_iter_start -= 1; + k_tiles_in_my_split += 1; + } + } + + if (work_tile_info.k_tile_count == 0) { + // This is a new unit + + if (!is_split_k) { + // + // Adjust the unit ending position and number of tiles to avoid + // computing splits of size less than min_iters_per_sk_unit_ + // + + // Begin by assuming that no adjustment is needed + auto initial_unit_iter_end = unit_iter_start + k_tiles_in_my_split; + + int unused, end_tile_k_tile; + params.divmod_tiles_per_output_tile_(unused, end_tile_k_tile, initial_unit_iter_end); + + if (end_tile_k_tile < Params::min_iters_per_sk_unit_) { + // Ending K tile is within the first Params::min_iters_per_sk_unit_ K tiles of some output tile, + // which means that this unit will compute a split with fewer than Params::min_iters_per_sk_unit_ K tiles. + // Adjust our work to shed these K tiles to a neighboring stream-K unit that will compute more consecutive K tiles. + k_tiles_in_my_split -= end_tile_k_tile; + } + else if (end_tile_k_tile > (params.divmod_tiles_per_output_tile_.divisor - Params::min_iters_per_sk_unit_)) { + // Ending K tile is within the final Params::min_iters_per_sk_unit_ K tiles of some output tile, + // which means that some other unit will compute a split with fewer than Params::min_iters_per_sk_unit_ K tiles. + // Adjust our work to take on these K tiles. + k_tiles_in_my_split += (params.divmod_tiles_per_output_tile_.divisor - end_tile_k_tile); + } + else if (params.ktile_start_alignment_count == 2 && end_tile_k_tile % 2 != 0) { + // ktile for each SM start from even number + // If start from odd number ktile within the output tile + // now start at the ktile one before my initial ktile start (take one ktile from prev sm) + // If end on odd number ktile within the output tile, + // now end at ktile that one before my ktile end (give one ktile to next sm) + k_tiles_in_my_split -= 1; + } + } + + work_tile_info.k_tile_remaining = k_tiles_in_my_split; + } + + uint32_t unit_iter_end = unit_iter_start + work_tile_info.k_tile_remaining - 1; + + // Find the output tile corresponding to the final k tile covered by this + // work unit. Stream-K work units will work backwards in terms of the tiles they + // are responsible computing. This is beneficial because the final (partial) + // tile computed by a stream-K block is typically the beginning of the output + // tile, while the beginning (partial) tile is typically the ending of another + // output tile. Since ending portions of an output tile must reduce across + // other work units computing portions of that output tile, it is preferable + // for them to be computed later, so as to reduce the likelihood of blocking + // on other work. 
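      // Editor's worked example (illustrative, not part of the patch; ignores the
      // min_iters_per_sk_unit_ adjustments above): with 8 K tiles per output tile and a
      // unit assigned global K iterations [5, 13), unit_iter_end = 12 lands in output
      // tile 1, so the unit first computes K tiles [0, 5) of tile 1 (K_idx = 0,
      // k_tile_count = 5). Once continue_current_work() subtracts those 5 iterations,
      // unit_iter_end becomes 7 and the unit finishes with K tiles [5, 8) of tile 0
      // (K_idx = 5, k_tile_count = 3), i.e. it walks backwards over the tiles it touches.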
+ + auto output_tile_id_in_group = params.divmod_tiles_per_output_tile_.divide(unit_iter_end); + uint32_t output_tile_iter_start = output_tile_id_in_group * params.divmod_tiles_per_output_tile_.divisor; + uint32_t output_tile_iter_end = output_tile_iter_start + params.divmod_tiles_per_output_tile_.divisor; + + // Convert the output tile from the linearized space within each group to the + // overall linearized space. + output_tile_id = (output_tile_id_in_group * params.divmod_sk_groups_.divisor) + group_idx; + + // Bring the linearized tile ID back into the space of tiles, rather than clusters + output_tile_id *= params.get_cluster_size(); + + // The final linearized tile ID is in units of the cluster dimension over which we rasterize. + if (params.raster_order_ == RasterOrder::AlongN) { + output_tile_id += cta_n_in_cluster * params.divmod_cluster_shape_minor_.divisor; + } + else { + output_tile_id += cta_m_in_cluster * params.divmod_cluster_shape_minor_.divisor; + } + + // The unit's starting k iteration in the current tile is either the starting + // iteration for the tile as a whole, or the starting k iteration for the unit + // as a whole (if the latter is greater than the former). + uint32_t tile_iter_start = max(output_tile_iter_start, unit_iter_start); + + // Similarly, the unit's ending k iteration (exclusive) is either the end of + // the current tile it is assigned, or the ending iteration of the unit as a whole + // (if the latter is less than the former). + uint32_t tile_iter_end = min(output_tile_iter_end, unit_iter_end + 1); + + // Set the k offset to be the starting k tile for this output tile + work_tile_info.K_idx = static_cast(tile_iter_start - output_tile_iter_start); + work_tile_info.k_tile_count = tile_iter_end - tile_iter_start; + } + + uint64_t work_idx_l, remainder; + params.divmod_batch_(work_idx_l, remainder, output_tile_id); + + uint64_t cta_per_grid_dim = params.divmod_cluster_shape_minor_.divide(remainder); + + auto [work_idx_m, work_idx_n] = UnderlyingScheduler::get_work_idx_m_and_n( + cta_per_grid_dim, + params.divmod_cluster_shape_major_, + params.divmod_cluster_shape_minor_, + params.divmod_cluster_blk_major_, + params.log_swizzle_size_, + params.raster_order_ + ); + + // Set the M, N, and L block offsets + work_tile_info.M_idx = work_idx_m; + work_tile_info.N_idx = work_idx_n; + work_tile_info.L_idx = static_cast(work_idx_l); + } + + // Returns the starting and ending peer ID of this tile + CUTLASS_HOST_DEVICE + static auto + tile_peer_range(Params const& params, uint32_t tile_idx, uint32_t cur_k_tile) { + auto tile_idx_in_cluster_path = params.div_cluster_size(tile_idx); + auto start_k_tile = params.divmod_tiles_per_output_tile_.divisor * tile_idx_in_cluster_path; + auto end_k_tile = start_k_tile + params.divmod_tiles_per_output_tile_.divisor - 1; + auto big_unit_k_tiles = params.big_units_ * (params.divmod_k_tiles_per_sk_unit_.divisor + 1); + + auto adjust_unit = [&](uint32_t k_tile, uint32_t unit_idx, uint32_t k_tiles_per_unit) { + auto unit_k_start = unit_idx * k_tiles_per_unit; + auto unit_k_end = unit_k_start + k_tiles_per_unit; + if (k_tile - start_k_tile < Params::min_iters_per_sk_unit_ && + unit_k_end - start_k_tile < Params::min_iters_per_sk_unit_) { + // k_tile is within the first min_iters_per_sk_unit_ K tiles of this output tile, + // and the stream-K unit computes fewer than min_iters_per_sk_unit_ K tiles for this + // output tile. This work will thus be subsumed by the next stream-K unit. 
+ ++unit_idx; + } + + if (end_k_tile + 1 - k_tile < Params::min_iters_per_sk_unit_ && + end_k_tile + 1 - unit_k_start < Params::min_iters_per_sk_unit_) { + // k_tile is within the last min_iters_per_sk_unit_ K tiles of this output tile, + // and the stream-K unit computes fewer than min_iters_per_sk_unit_ K tiles for this + // output tile. This work will thus be subsumed by the previous stream-K unit. + --unit_idx; + } + + return unit_idx; + }; + + // Lambda to find the ID of the stream-K unit that computes this K tile + auto find_unit = [&](uint32_t k_tile) { + if (k_tile < big_unit_k_tiles) { + // The tile is within the "big unit range" + auto unit_idx = params.divmod_k_tiles_per_sk_big_unit_.divide(k_tile); + return static_cast(adjust_unit(k_tile, unit_idx, params.divmod_k_tiles_per_sk_big_unit_.divisor)); + } + else { + // The tile is after the "big unit range." Account for this by finding the "normal unit" + // that it belongs to, and then offsetting by the number of big units + auto unit_idx = params.divmod_k_tiles_per_sk_unit_.divide(k_tile - big_unit_k_tiles) + params.big_units_; + return static_cast(adjust_unit(k_tile, unit_idx, params.divmod_k_tiles_per_sk_unit_.divisor)); + } + }; + + return cute::make_tuple(find_unit(start_k_tile), find_unit(cur_k_tile), find_unit(end_k_tile)); + }*/ +}; + +} // namespace cutlass::gemm::kernel::detail diff --git a/include/cutlass/gemm/kernel/tile_scheduler.hpp b/include/cutlass/gemm/kernel/tile_scheduler.hpp index 9835e37fc8..e354e7a48c 100644 --- a/include/cutlass/gemm/kernel/tile_scheduler.hpp +++ b/include/cutlass/gemm/kernel/tile_scheduler.hpp @@ -40,6 +40,9 @@ #include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp" #include "cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp" #include "cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp" +#if defined (SYCL_INTEL_TARGET) +#include "cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp" +#endif //////////////////////////////////////////////////////////////////////////////// namespace cutlass::gemm { @@ -127,6 +130,19 @@ struct TileSchedulerSelector< using Scheduler = PersistentTileSchedulerSm90StreamK; }; +template < + class TileShape, + class ClusterShape +> +struct TileSchedulerSelector< + StreamKScheduler, + arch::IntelPVC, + TileShape, + ClusterShape + > { + using Scheduler = PersistentTileSchedulerIntelPVCStreamK; +}; + template < class TileShape, class ClusterShape diff --git a/include/cutlass/workspace.h b/include/cutlass/workspace.h index 31c48435b1..bb74826805 100644 --- a/include/cutlass/workspace.h +++ b/include/cutlass/workspace.h @@ -61,7 +61,9 @@ zero_workspace(void* workspace, size_t workspace_size, cudaStream_t stream = nul CUTLASS_TRACE_HOST(" clearing workspace"); -#if defined(CUTLASS_ENABLE_CUDA_HOST_ADAPTER) && CUTLASS_ENABLE_CUDA_HOST_ADAPTER +#if defined CUTLASS_ENABLE_SYCL + syclcompat::memset_async(workspace, 0, workspace_size); +#elif defined(CUTLASS_ENABLE_CUDA_HOST_ADAPTER) && CUTLASS_ENABLE_CUDA_HOST_ADAPTER // // Use the cuda host adapter // From ce8b3a27be55fbc77eb0674b08bd5a48c38dc353 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Thu, 29 Aug 2024 00:20:09 +0100 Subject: [PATCH 02/30] fixed starting index calculation --- include/cutlass/barrier.h | 22 ++++++++++--------- ...rsistent_tile_scheduler_params_streamk.hpp | 15 +++++++------ .../intel_pvc_tile_scheduler_streamk.hpp | 16 +++++++++----- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/include/cutlass/barrier.h b/include/cutlass/barrier.h index 7d6a0ec902..9cf7343976 100644 --- 
a/include/cutlass/barrier.h +++ b/include/cutlass/barrier.h @@ -98,11 +98,10 @@ struct GenericBarrier { int state = 0; #ifdef SYCL_INTEL_TARGET - using atomicT = sycl::atomic_ref; - auto atm = atomicT(*ptr); - return atm.load(); + sycl::access::address_space::global_space>(*ptr); + return atm.load(sycl::memory_order::acquire); #elif defined (__CUDA_ARCH__ >= 700) /// SM70 and newer use memory consistency qualifiers @@ -146,8 +145,8 @@ struct GenericBarrier { if (thread_idx == 0) { // Spin-loop - // #pragma unroll 1 - // while(ld_acquire(flag_ptr) < count) {} + #pragma unroll 1 + while(ld_acquire(flag_ptr) < count) {} } Sync::sync(); @@ -161,9 +160,10 @@ struct GenericBarrier { if (thread_idx == 0) { + //printf("BlockID: %lu | wait_eq: %d\n", BlockIdxY(), val); // Spin-loop - // #pragma unroll 1 - // while(ld_acquire(flag_ptr) != val) {} + #pragma unroll 1 + while(ld_acquire(flag_ptr) != val) {break;} } Sync::sync(); } @@ -175,9 +175,10 @@ struct GenericBarrier { if (thread_idx == 0) { + //printf("BlockID: %lu | wait_eq_reset: %d\n", BlockIdxY(), val); // Spin-loop - // #pragma unroll 1 - // while(atomicCAS(flag_ptr, val, 0) != val) {} + #pragma unroll 1 + while(atomicCAS(flag_ptr, val, 0) != val) {} } Sync::sync(); @@ -193,6 +194,7 @@ struct GenericBarrier { if (thread_idx == 0) { + //printf("BlockID: %lu | arrive_inc_val: %d\n", BlockIdxY(), val); red_release(flag_ptr, val); } } diff --git a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp index 478c6dbb3a..0d22afeb8e 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp @@ -140,7 +140,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { static constexpr uint32_t min_iters_per_sk_unit_ = 8u; // Maximum number of groups of stream-K units - // static constexpr uint32_t max_sk_groups_ = 8u; + static constexpr uint32_t max_sk_groups_ = 1u; // ktile start from even for each cta uint32_t ktile_start_alignment_count { 1u }; @@ -351,7 +351,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { groups = fallback_groups; }*/ - uint32_t groups = 1; + uint32_t groups = max_sk_groups_; auto sk_units_per_group = sk_units / groups; @@ -427,8 +427,6 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { cute::tuple get_work_idx_m_and_n( uint64_t blk_per_grid_dim, - FastDivmodU64Pow2 const& divmod_cluster_shape_major, - FastDivmodU64Pow2 const& divmod_cluster_shape_minor, FastDivmodU64 const& divmod_cluster_blk_major) { uint64_t m_idx, n_idx; @@ -471,9 +469,12 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { dim3 problem_blocks, KernelHardwareInfo hw_info ) { - uint32_t available_sms = hw_info.sm_count / 8; - uint32_t dimx = ((problem_blocks.x * problem_blocks.y) + available_sms - 1) / available_sms; - return dim3{available_sms, dimx, problem_blocks.z}; + uint32_t available_sms = 32;//hw_info.sm_count / 8; + // printf("available_sms: %d\n", available_sms); + auto possibly_truncate = [&](int x, int y) { + return static_cast(platform::min(x, y)); + }; + return dim3{1, possibly_truncate(available_sms, problem_blocks.x * problem_blocks.y * problem_blocks.z), 1}; } // Returns the number of stream-K tiles that will be computed amongst `output_tiles` total diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp 
b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index 91174cfc89..e1f5edb64c 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -185,7 +185,8 @@ class PersistentTileSchedulerIntelPVCStreamK { CUTLASS_HOST_DEVICE PersistentTileSchedulerIntelPVCStreamK(Params const& params_) : scheduler_params(params_) { - current_work_linear_idx_ = uint64_t(BlockIdxX()) + uint64_t(BlockIdxY()) * uint64_t(GridDimX()); + // current_work_linear_idx_ = uint64_t(BlockIdxX()) + uint64_t(BlockIdxY()) * uint64_t(GridDimX()); + current_work_linear_idx_ = uint64_t(BlockIdxY()); } CUTLASS_DEVICE @@ -580,8 +581,8 @@ CUTLASS_DEVICE // auto big_units_in_group = params.div_cluster_size( // k_tiles_in_group - (k_tiles_per_unit_in_group * params.divmod_sk_units_per_group_.divisor)); - // uint64_t split; - // params.divmod_clusters_mnl_(split, cluster_linear_work_idx, cluster_linear_work_idx); + uint64_t split; + params.divmod_sk_units_per_group_(split, output_tile_id, output_tile_id); bool is_split_k = params.divmod_splits_.divisor > 1; auto big_unit_cmp_lhs = output_tile_id; @@ -590,7 +591,7 @@ CUTLASS_DEVICE auto k_tiles_per_split = is_split_k ? params.divmod_k_tiles_per_sk_unit_.divisor : k_tiles_per_unit_in_group; // Determine the starting k iteration computed by this stream-K work unit - uint32_t unit_iter_start = (linear_idx_mult * linear_idx) + k_tiles_per_split; + uint32_t unit_iter_start = (linear_idx_mult * linear_idx) + (k_tiles_per_split * split); // Adjust the starting position and number of k iterations for "big units," which // compute one extra iteration. If there are any big units, they will be the first @@ -733,8 +734,6 @@ CUTLASS_DEVICE auto [work_idx_m, work_idx_n] = Params::get_work_idx_m_and_n( cta_per_grid_dim, - params.divmod_cluster_shape_major_, - params.divmod_cluster_shape_minor_, params.divmod_cluster_blk_major_ ); @@ -742,6 +741,11 @@ CUTLASS_DEVICE work_tile_info.M_idx = work_idx_m; work_tile_info.N_idx = work_idx_n; work_tile_info.L_idx = work_idx_l; + + // if(linear_idx >= 32 && ThreadIdxX() == 0) + // printf("BlockID: %lu | k_tile_count: %d | M_idx: %lu | N_idx: %lu | K_idx: %lu | L_idx: %lu | ctas_per_grid_dim: %lu | output_tile_id: %lu\n", + // BlockIdxY(), work_tile_info.k_tile_count, work_tile_info.M_idx, work_tile_info.N_idx, work_tile_info.K_idx, + // work_tile_info.L_idx, remainder, output_tile_id); } // Returns the starting and ending peer ID of this tile From 9599d62b71dabb32f0aa8a8557527be03b335c79 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Thu, 29 Aug 2024 01:41:01 +0100 Subject: [PATCH 03/30] Fixed barrier count update --- include/cutlass/barrier.h | 5 +---- .../gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp | 10 +++++----- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/include/cutlass/barrier.h b/include/cutlass/barrier.h index 9cf7343976..f561ec664e 100644 --- a/include/cutlass/barrier.h +++ b/include/cutlass/barrier.h @@ -160,10 +160,9 @@ struct GenericBarrier { if (thread_idx == 0) { - //printf("BlockID: %lu | wait_eq: %d\n", BlockIdxY(), val); // Spin-loop #pragma unroll 1 - while(ld_acquire(flag_ptr) != val) {break;} + while(ld_acquire(flag_ptr) != val) {} } Sync::sync(); } @@ -175,7 +174,6 @@ struct GenericBarrier { if (thread_idx == 0) { - //printf("BlockID: %lu | wait_eq_reset: %d\n", BlockIdxY(), val); // Spin-loop #pragma unroll 1 while(atomicCAS(flag_ptr, val, 0) != val) {} @@ -194,7 +192,6 @@ struct GenericBarrier { if 
(thread_idx == 0) { - //printf("BlockID: %lu | arrive_inc_val: %d\n", BlockIdxY(), val); red_release(flag_ptr, val); } } diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index e1f5edb64c..af929c2b87 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -294,7 +294,7 @@ template uint32_t num_barriers = 1, uint32_t barrier_idx = 0) { static constexpr uint32_t Offset = static_cast(cutlass::arch::ReservedNamedBarriers::StreamkBarrier0); - static constexpr uint32_t MaxNumNamedBarriers = 2; + static constexpr uint32_t MaxNumNamedBarriers = 1; using BarrierManager = NamedBarrierManager; return fixup_helper( params, work_tile_info, subgroup_id, accumulators, num_barriers, barrier_idx); @@ -371,7 +371,7 @@ template } else { // Wait until the preceding split added its accumulators - BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx); + BarrierManager::wait_eq(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, work_tile_info.K_idx); // Perform reduction in workspace BlockStripedReduceT::reduce(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx); @@ -382,16 +382,16 @@ template int32_t increment = work_tile_info.k_tile_count; // Signal our arrival - BarrierManager::arrive_inc(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, increment); + BarrierManager::arrive_inc(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, increment); } else { if (params.reduction_mode_ == ReductionMode::Deterministic) { // Wait until the preceding split added its accumulators - BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx); + BarrierManager::wait_eq(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, work_tile_info.K_idx); } else { // Wait unitl the first split has stored its accumulators - BarrierManager::wait_lt(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, 1); + BarrierManager::wait_lt(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, 1); } // The block computing the final split for the tile adds previously-reduced partials From 76e54f9d3d26cf52116a6cffaac1125e149e8237 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Thu, 29 Aug 2024 11:54:15 +0100 Subject: [PATCH 04/30] Fixed compilation for normal GEMM --- include/cutlass/gemm/kernel/intel_pvc_gemm.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp index 24f799a8a7..a7924db7bb 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp @@ -53,7 +53,7 @@ class GemmUniversal< CollectiveEpilogue_, TileScheduler_, cute::enable_if_t - && cute::is_same_v>> + && !cute::is_same_v>> { public: // From b072873ab17af0fa734f3e52ab50eeaed7528a66 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Thu, 29 Aug 2024 12:40:40 +0100 Subject: [PATCH 05/30] Perform fixup using threadid instead of subgroup_id --- .../gemm/kernel/intel_pvc_gemm_streamk.hpp | 4 +-- .../intel_pvc_tile_scheduler_streamk.hpp | 27 +++++++++---------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp index 4b0e7dbff9..bd35b59e57 100644 --- 
a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp @@ -339,8 +339,8 @@ class GemmUniversal< ); // Perform reduction across splits, if needed - TileScheduler::template fixup( - params.scheduler, work_tile_info, sub_group_id, accumulators); + TileScheduler::template fixup( + params.scheduler, work_tile_info, accumulators); if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) { CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index af929c2b87..b01dd4e882 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -283,31 +283,29 @@ class PersistentTileSchedulerIntelPVCStreamK { } // Performs the reduction across splits for a given output tile. -template +template CUTLASS_DEVICE static void fixup( Params const& params, WorkTileInfo const& work_tile_info, - const uint32_t subgroup_id, FrgTensorC& accumulators, uint32_t num_barriers = 1, uint32_t barrier_idx = 0) { static constexpr uint32_t Offset = static_cast(cutlass::arch::ReservedNamedBarriers::StreamkBarrier0); static constexpr uint32_t MaxNumNamedBarriers = 1; - using BarrierManager = NamedBarrierManager; - return fixup_helper( - params, work_tile_info, subgroup_id, accumulators, num_barriers, barrier_idx); + using BarrierManager = NamedBarrierManager; + return fixup_helper( + params, work_tile_info, accumulators, num_barriers, barrier_idx); } // Helper for performing the reduction across splits for a given output tile. - template + template CUTLASS_DEVICE static void fixup_helper( Params const& params, WorkTileInfo const& work_tile_info, - const uint32_t subgroup_id, FrgTensorC& accumulators, uint32_t num_barriers, uint32_t barrier_idx, @@ -326,24 +324,23 @@ template auto reduction_tile_idx = tile_idx; auto [first_peer_id, my_peer_id, last_peer_id] = tile_peer_range(params, tile_idx, static_cast(work_tile_info.K_idx)); auto reduction_peer_offset = 0; + int barrier_group_thread_idx = ThreadIdxX(); // Reductions use BlockStripedReduce with a width of BarrierManager::ThreadCount under the hood. // Thus, the start of the reduction space is the same across all threads in a warp group. 
int reduction_offset = (cute::size<0>(TileShape{}) * cute::size<1>(TileShape{}) * reduction_tile_idx * num_accumulator_mtxs) + reduction_peer_offset + - (size(accumulators) * subgroup_id * SubgroupSize); + (size(accumulators) * barrier_group_thread_idx * ThreadsPerBlock); ElementAccumulator* group_reduction_workspace = reinterpret_cast(params.reduction_workspace_) + reduction_offset; using AccumulatorArrayT = Array; - using BlockStripedReduceT = BlockStripedReduce; + using BlockStripedReduceT = BlockStripedReduce; AccumulatorArrayT* reduction_workspace_array = reinterpret_cast(group_reduction_workspace); AccumulatorArrayT* accumulator_array = reinterpret_cast(accumulators.data()); - int barrier_group_thread_idx = ThreadIdxX() % SubgroupSize; - // The number of tiles for which reduction is required is either: // (a) the total number of output tiles (in the case of split-K) // (b) the number of stream-K tiles (potentially multiplied by peer count if using separate reduction) @@ -371,7 +368,7 @@ template } else { // Wait until the preceding split added its accumulators - BarrierManager::wait_eq(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, work_tile_info.K_idx); + BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx); // Perform reduction in workspace BlockStripedReduceT::reduce(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx); @@ -382,16 +379,16 @@ template int32_t increment = work_tile_info.k_tile_count; // Signal our arrival - BarrierManager::arrive_inc(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, increment); + BarrierManager::arrive_inc(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, increment); } else { if (params.reduction_mode_ == ReductionMode::Deterministic) { // Wait until the preceding split added its accumulators - BarrierManager::wait_eq(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, work_tile_info.K_idx); + BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx); } else { // Wait unitl the first split has stored its accumulators - BarrierManager::wait_lt(barrier_idx, lock_workspace, ThreadIdxX(), lock_idx, 1); + BarrierManager::wait_lt(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, 1); } // The block computing the final split for the tile adds previously-reduced partials From 6be72be8344a7a1cf44005a4c98b4637385eabfc Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Thu, 29 Aug 2024 15:14:58 +0100 Subject: [PATCH 06/30] Fixed the k_idx offset for MMA atom and corrected the reduction offset calculation --- examples/sycl/pvc/pvc_gemm.cpp | 2 ++ include/cutlass/barrier.h | 2 +- include/cutlass/functional.h | 2 +- include/cutlass/gemm/collective/intel_pvc_mma.hpp | 5 +++-- .../cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp | 5 ++--- include/cutlass/gpu_generics.h | 4 ++-- 6 files changed, 11 insertions(+), 9 deletions(-) diff --git a/examples/sycl/pvc/pvc_gemm.cpp b/examples/sycl/pvc/pvc_gemm.cpp index 2d3d2665f5..3cd69255eb 100644 --- a/examples/sycl/pvc/pvc_gemm.cpp +++ b/examples/sycl/pvc/pvc_gemm.cpp @@ -244,6 +244,8 @@ struct ExampleRunner { GPU_Clock timer; timer.start(); for (int i = 0; i < options.iterations; ++i) { + if(workspace_size > 0) + gemm_op.initialize(arguments, workspace.get()); gemm_op.run(); } syclcompat::wait(); diff --git a/include/cutlass/barrier.h b/include/cutlass/barrier.h index f561ec664e..d5dfaf4a63 100644 --- a/include/cutlass/barrier.h +++ 
b/include/cutlass/barrier.h @@ -98,7 +98,7 @@ struct GenericBarrier { int state = 0; #ifdef SYCL_INTEL_TARGET - auto atm = sycl::atomic_ref(*ptr); return atm.load(sycl::memory_order::acquire); diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h index 9fa7bf7ba1..48ec345cec 100644 --- a/include/cutlass/functional.h +++ b/include/cutlass/functional.h @@ -782,7 +782,7 @@ struct atomic_add CUTLASS_DEVICE void operator()(T *ptr, const T &data) { -#if defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(CUTLASS_ENABLE_SYCL) atomicAdd(ptr, data); #endif } diff --git a/include/cutlass/gemm/collective/intel_pvc_mma.hpp b/include/cutlass/gemm/collective/intel_pvc_mma.hpp index 38239aaddc..a12a05ded2 100644 --- a/include/cutlass/gemm/collective/intel_pvc_mma.hpp +++ b/include/cutlass/gemm/collective/intel_pvc_mma.hpp @@ -231,7 +231,7 @@ struct CollectiveMma< // // Mainloop // - const int k_start_idx = crd2idx((*k_tile_iter), make_shape(K)); + const int k_start_idx = crd2idx((*k_tile_iter), make_shape(K)) * get<2>(SubgroupTileShape{}); int prefetch_k = k_start_idx; for (int i = 0; i < DispatchPolicy::Stages; i++) { @@ -241,13 +241,14 @@ struct CollectiveMma< } for (int k_tile = 0, k = k_start_idx; k_tile < k_tile_count; - ++k_tile, k += get<2>(SubgroupTileShape{}), prefetch_k += get<2>(SubgroupTileShape{})) { + ++k_tile, k += get<2>(SubgroupTileShape{})) { // Copy gmem to rmem for the first k_tile copy(mainloop.gmem_tiled_copy_a, gA(_, _, k), tAr); copy(mainloop.gmem_tiled_copy_b, gB(_, _, k), tBr); prefetch(mainloop.gmem_tiled_copy_a, tAi(_, _, prefetch_k)); prefetch(mainloop.gmem_tiled_copy_b, tBi(_, _, prefetch_k)); + prefetch_k += get<2>(SubgroupTileShape{}); cute::gemm(tiled_mma, accum, tAr_view, tBr_view, src_accum); } diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index b01dd4e882..4a4d739b71 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -330,8 +330,7 @@ template // Thus, the start of the reduction space is the same across all threads in a warp group. 
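     // Editor's walkthrough (illustrative numbers, not part of the patch): suppose an
     // output tile has 24 K tiles covered by three stream-K units of 8 K tiles each.
     // The unit with K_idx == 0 stores its partials and arrives with increment 8; the
     // unit with K_idx == 8 waits until the lock equals 8, reduces into the workspace
     // and arrives with increment 8 (lock becomes 16); the final unit (K_idx == 16)
     // waits for 16, adds the accumulated partials to its registers via load_add and
     // then computes the epilogue.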
int reduction_offset = (cute::size<0>(TileShape{}) * cute::size<1>(TileShape{}) * reduction_tile_idx * num_accumulator_mtxs) + - reduction_peer_offset + - (size(accumulators) * barrier_group_thread_idx * ThreadsPerBlock); + reduction_peer_offset; ElementAccumulator* group_reduction_workspace = reinterpret_cast(params.reduction_workspace_) + reduction_offset; @@ -739,7 +738,7 @@ CUTLASS_DEVICE work_tile_info.N_idx = work_idx_n; work_tile_info.L_idx = work_idx_l; - // if(linear_idx >= 32 && ThreadIdxX() == 0) + // if(linear_idx < 32 && ThreadIdxX() == 0) // printf("BlockID: %lu | k_tile_count: %d | M_idx: %lu | N_idx: %lu | K_idx: %lu | L_idx: %lu | ctas_per_grid_dim: %lu | output_tile_id: %lu\n", // BlockIdxY(), work_tile_info.k_tile_count, work_tile_info.M_idx, work_tile_info.N_idx, work_tile_info.K_idx, // work_tile_info.L_idx, remainder, output_tile_id); diff --git a/include/cutlass/gpu_generics.h b/include/cutlass/gpu_generics.h index 2c32760e83..a406791437 100644 --- a/include/cutlass/gpu_generics.h +++ b/include/cutlass/gpu_generics.h @@ -314,8 +314,8 @@ using cudaStream_t = void *; using dim3 = syclcompat::dim3; // Atomic - -CUTLASS_DEVICE int atomicAdd(int *address, int val) { +template +CUTLASS_DEVICE T atomicAdd(T *address, T val) { #if defined(__SYCL_DEVICE_ONLY__) return syclcompat::atomic_fetch_add(address, val); #endif From e00f9dade4bae2bbf19ac47df9fa9620c08fb9a3 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Thu, 29 Aug 2024 16:34:20 +0100 Subject: [PATCH 07/30] Use log2 for available_xecores --- .../gemm/kernel/intel_pvc_gemm_streamk.hpp | 30 +++++--- ...rsistent_tile_scheduler_params_streamk.hpp | 76 +------------------ .../intel_pvc_tile_scheduler_streamk.hpp | 4 +- 3 files changed, 23 insertions(+), 87 deletions(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp index bd35b59e57..0f597a92cf 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp @@ -289,9 +289,25 @@ class GemmUniversal< constexpr auto workgroup_shape = WorkgroupTileShape{}; // (BLK_M,BLK_N,BLK_K) constexpr auto subgroup_shape = SubgroupTileShape{}; // (SUB_M,SUB_N,SUB_K) + constexpr int version = + is_same_v + ? 1 + : 2; + + auto k_residue = K - get<2>(subgroup_shape) * (K / get<2>(subgroup_shape)); // K - SUB_K * k_coord_max + + TiledMma tiled_mma; + CollectiveMainloop collective_mma; + CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; + + Tensor accumulators = make_tensor(Shape, Int, Int>{}); + + const int m_offset = sub_group_id / CollectiveMainloop::sg_per_wg_n * get<0>(subgroup_shape); + const int n_offset = sub_group_id % CollectiveMainloop::sg_per_wg_n * get<1>(subgroup_shape); while (work_tile_info.is_valid()) { - const int m_coord = work_tile_info.M_idx * get<0>(workgroup_shape) + sub_group_id / CollectiveMainloop::sg_per_wg_n * get<0>(subgroup_shape); - const int n_coord = work_tile_info.N_idx * get<1>(workgroup_shape) + sub_group_id % CollectiveMainloop::sg_per_wg_n * get<1>(subgroup_shape); + const int m_coord = work_tile_info.M_idx * get<0>(workgroup_shape) + m_offset; + const int n_coord = work_tile_info.N_idx * get<1>(workgroup_shape) + n_offset; const int l_coord = work_tile_info.L_idx; // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work. 
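      // Editor's note (assumed tile sizes, not part of the patch): with a 256x256
      // work-group tile, a 32x64 sub-group tile and sg_per_wg_n = 4, a work-group at
      // M_idx = 3 with sub_group_id = 5 starts at
      // m_coord = 3 * 256 + (5 / 4) * 32 = 800 and
      // n_coord = N_idx * 256 + (5 % 4) * 64 = N_idx * 256 + 64.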
@@ -304,11 +320,6 @@ class GemmUniversal< make_coord(m_coord, 0, 0), make_shape(_1{}, K, L), make_stride(Int{} * get<0>(MmaAtomShape()),_1{})); - constexpr int version = - is_same_v - ? 1 - : 2; Tensor tBi = params.mainloop.gmem_tiled_copy_b.get_pvc_tensor( make_coord(n_coord, 0, 0), @@ -318,13 +329,10 @@ class GemmUniversal< // Compute tile residues for predication auto m_max_coord = M - get<0>(subgroup_shape) * m_coord; // M - SUB_M * m_coord auto n_max_coord = N - get<1>(subgroup_shape) * n_coord; // N - SUB_N * n_coord - auto k_residue = K - get<2>(subgroup_shape) * (K / get<2>(subgroup_shape)); // K - SUB_K * k_coord_max auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue); - Tensor accumulators = make_tensor(Shape, Int, Int>{}); clear(accumulators); - CollectiveMainloop collective_mma; // Perform the collective scoped MMA collective_mma( accumulators, @@ -343,8 +351,6 @@ class GemmUniversal< params.scheduler, work_tile_info, accumulators); if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) { - CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; - TiledMma tiled_mma; epilogue( problem_shape_MNKL, subgroup_shape, diff --git a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp index 0d22afeb8e..7e5e22e09b 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp @@ -84,10 +84,8 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { StreamK }; - FastDivmodU64Pow2 divmod_cluster_shape_major_{}; - FastDivmodU64Pow2 divmod_cluster_shape_minor_{}; FastDivmodU64 divmod_batch_{}; - FastDivmodU64 divmod_cluster_blk_major_{}; + FastDivmodU64 divmod_blk_major_{}; // We divide up the number of stream-K tiles amongst G groups of stream-K units. // The stream-K units within a group collaborate to comptue over the `sk_tiles / G` @@ -292,65 +290,6 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { return; } - bool do_separate_reduction = false; - // should_perform_separate_reduction( - // epilogue_subtile, sk_units, sk_tiles, dp_tiles, ctas_per_wave); - - // Determine the number of stream-K groups that will be used. Choosing the - // fast moving dimension of the underlying grid. - /*uint32_t max_groups_problem = problem_blocks_n; - - // Select the number of groups that will be use. 
We start with the maximum - // number of potential groups, and iterate down looking for a group size that - // evenly divides the stream-K units and tiles, and for which the resulting - // number of K tiles per stream-K unit remains above min_iters_per_sk_unit_ - - uint32_t groups = platform::min(max_groups_problem, uint32_t(max_sk_groups_)); - - // Grouping is disabled when separate reduction is used - // if (do_separate_reduction) { - // groups = 1; - // } - - uint32_t fallback_groups = 0; - - auto sk_splits_too_small = [&](uint32_t g) { - // Check whether the number of K tiles computed per stream-K unit is less - // than min_iters_per_sk_unit_ - auto total_sk_tiles = sk_tiles / g; - auto total_sk_k_tiles = total_sk_tiles * k_tiles_per_output_tile; - auto k_tiles_per_sk_unit = total_sk_k_tiles / (sk_units / g); - return k_tiles_per_sk_unit < min_iters_per_sk_unit_; - }; - - auto is_ideal_grouping = [&](uint32_t g) { - // An ideal grouping will evenly divide stream-K clusters, evenly divide - // stream-K tiles, and not result in stream-K splits that are too small. - return (sk_units % g == 0) && (sk_tiles % g == 0) && !sk_splits_too_small(g); - }; - - auto is_valid_grouping = [&](uint32_t g) { - // A grouping is valid, but not ideal, if it evenly divides the - // stream-K clusters and does not result in stream-K splits that are - // too small. Such a setting can be used as a fallback option in the - // case that an ideal grouping is not achievable - return sk_units % g == 0 && !sk_splits_too_small(g); - }; - - while (groups > 1 && !is_ideal_grouping(groups)) { - if (fallback_groups == 0 && is_valid_grouping(groups)) { - // Set fallback groups once in preference for a larger number of groups. - fallback_groups = groups; - } - --groups; - } - - // If groups == 1, we did not find a group count that satisfies all criteria. If we have - // found a fallback group count, use this instead. - if (groups == 1 && fallback_groups > 0) { - groups = fallback_groups; - }*/ - uint32_t groups = max_sk_groups_; auto sk_units_per_group = sk_units / groups; @@ -374,13 +313,6 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // Use separate reduction when we have less than one wave of output tiles (dp_tiles == 0) // and when each tile will be operated on by at least two stream-K units (sk_units > 2 * sk_tiles) - // if (do_separate_reduction) { - // // Each reduction unit will reduce the partials of an epilogue subtile for - // // a given output tile and compute the epilogue. Thus, there are as many reduction - // // units as there are epilogue subtiles. - // reduction_units = sk_tiles * epilogue_subtile; - // } - // else if (decomposition_mode == DecompositionMode::Heuristic && sk_tiles < sk_units && sk_units % sk_tiles == 0) { // If the number of stream-K units is a multiple of the number of stream-K tiles, then // the problem can leverage a basic split-K decomposition for the stream-K tiles. 
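[Editorial aside, not part of the patch] A worked example, with made-up sizes, of the heuristic branch at the end of the hunk above: when the stream-K units evenly divide the stream-K tiles, the scheduler falls back to a split-K style assignment of those tiles.

#include <cstdio>

int main() {
  const unsigned sk_tiles = 4;                  // output tiles left over for stream-K (assumed)
  const unsigned sk_units = 16;                 // work-groups available to cover them (assumed)
  const unsigned k_tiles_per_output_tile = 32;  // K iterations per output tile (assumed)

  if (sk_tiles < sk_units && sk_units % sk_tiles == 0) {
    const unsigned splits = sk_units / sk_tiles;                        // 4 units per output tile
    const unsigned k_tiles_per_unit = k_tiles_per_output_tile / splits; // 8 K tiles per unit
    std::printf("split-K fallback: %u-way split, %u K tiles per unit\n", splits, k_tiles_per_unit);
  }
  return 0;
}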
@@ -403,9 +335,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { divmod_sk_groups_ = FastDivmodU64(static_cast(groups)); divmod_sk_units_per_group_ = FastDivmodU64(static_cast(sk_units / groups)); - divmod_cluster_shape_major_ = FastDivmodU64Pow2(1); - divmod_cluster_shape_minor_ = FastDivmodU64Pow2(1); - divmod_cluster_blk_major_ = FastDivmodU64(problem_blocks_n); + divmod_blk_major_ = FastDivmodU64(problem_blocks_n); divmod_splits_ = FastDivmod(splits); units_per_problem_ = static_cast(dp_units + sk_units); @@ -469,7 +399,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { dim3 problem_blocks, KernelHardwareInfo hw_info ) { - uint32_t available_sms = 32;//hw_info.sm_count / 8; + uint32_t available_sms = 1 << find_log2(hw_info.sm_count / 8); // printf("available_sms: %d\n", available_sms); auto possibly_truncate = [&](int x, int y) { return static_cast(platform::min(x, y)); diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index 4a4d739b71..142bd3ad8f 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -415,7 +415,7 @@ template output_tile_index(Params const& params, WorkTileInfo const& work_tile_info) { uint64_t linear_idx_in_batch = Params::get_linear_idx_from_m_and_n( work_tile_info.M_idx, work_tile_info.N_idx, - params.divmod_cluster_blk_major_ + params.divmod_blk_major_ ); uint64_t tiles_mn = params.divmod_batch_.divisor; @@ -730,7 +730,7 @@ CUTLASS_DEVICE auto [work_idx_m, work_idx_n] = Params::get_work_idx_m_and_n( cta_per_grid_dim, - params.divmod_cluster_blk_major_ + params.divmod_blk_major_ ); // Set the M, N, and L block offsets From 224b31678724a260d032819770f231a83db35ddd Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 2 Sep 2024 12:03:17 +0100 Subject: [PATCH 08/30] SplitK working --- cmake/FindDPCPP.cmake | 1 + include/cutlass/arch/barrier.h | 15 ++++++++++++--- include/cutlass/functional.h | 2 +- ...c_persistent_tile_scheduler_params_streamk.hpp | 4 ++++ .../kernel/intel_pvc_tile_scheduler_streamk.hpp | 13 +++++++------ 5 files changed, 25 insertions(+), 10 deletions(-) diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake index 6ed98d3e4b..a18eda051c 100644 --- a/cmake/FindDPCPP.cmake +++ b/cmake/FindDPCPP.cmake @@ -43,6 +43,7 @@ set(DPCPP_COMPILE_ONLY_FLAGS "") if(NOT "${DPCPP_SYCL_TARGET}" STREQUAL "") list(APPEND DPCPP_FLAGS "-fsycl-targets=${DPCPP_SYCL_TARGET};") + list(APPEND DPCPP_FLAGS "-Xspirv-translator;--spirv-ext=+SPV_INTEL_split_barrier;-DSPV_INTEL_split_barrier;") endif() if(NOT "${DPCPP_USER_FLAGS}" STREQUAL "") diff --git a/include/cutlass/arch/barrier.h b/include/cutlass/arch/barrier.h index cd2d7be3cb..0ac6da8813 100644 --- a/include/cutlass/arch/barrier.h +++ b/include/cutlass/arch/barrier.h @@ -36,7 +36,11 @@ #include #include -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && (__CUDACC_VER_MAJOR__ >= 12) + +#if defined SYCL_INTEL_TARGET +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierWaitINTEL(int, int, int); +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierArriveINTEL(int, int, int); +#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && (__CUDACC_VER_MAJOR__ >= 12) #define CUDA_BARRIER_ENABLED 1 #else #define CUDA_BARRIER_ENABLED 0 @@ -151,7 +155,10 @@ class NamedBarrier { private: CUTLASS_DEVICE static void arrive_and_wait_internal(uint32_t num_threads, uint32_t barrier_id) { 
-#if CUDA_BARRIER_ENABLED +#if defined SYCL_INTEL_TARGET + __spirv_ControlBarrierArriveINTEL(2, 2, 0x4); + __spirv_ControlBarrierWaitINTEL(2, 2, 0x2); +#elif defined CUDA_BARRIER_ENABLED asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads)); #elif defined(__CUDA_ARCH__) asm volatile ("brkpt;\n" ::); @@ -160,7 +167,9 @@ class NamedBarrier { CUTLASS_DEVICE static void arrive_internal(uint32_t num_threads, uint32_t barrier_id) { -#if CUDA_BARRIER_ENABLED +#if defined SYCL_INTEL_TARGET + __spirv_ControlBarrierArriveINTEL(2, 2, 0x4); +#elif CUDA_BARRIER_ENABLED asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads)); #elif defined(__CUDA_ARCH__) asm volatile ("brkpt;\n" ::); diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h index 48ec345cec..57835f8d35 100644 --- a/include/cutlass/functional.h +++ b/include/cutlass/functional.h @@ -782,7 +782,7 @@ struct atomic_add CUTLASS_DEVICE void operator()(T *ptr, const T &data) { -#if defined(__CUDA_ARCH__) || defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA_ARCH__) || defined(__SYCL_DEVICE_ONLY__) atomicAdd(ptr, data); #endif } diff --git a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp index 7e5e22e09b..5691059817 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp @@ -705,8 +705,11 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { divmod_batch_ = FastDivmodU64(blocks_m * blocks_n); divmod_tiles_per_output_tile_ = FastDivmod(k_tiles_per_output_tile); + divmod_sk_groups_ = FastDivmodU64(1u); divmod_splits_ = FastDivmod(splits); + divmod_blk_major_ = FastDivmodU64(blocks_n); units_per_problem_ = blocks_m * blocks_n * blocks_l; + big_units_ = k_tiles_per_output_tile % splits; reduction_workspace_ = reduction_workspace; reduction_mode_ = reduction_mode; divmod_k_tiles_per_sk_unit_ = FastDivmod(k_tiles_per_output_tile / splits); @@ -714,6 +717,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // No stream-K work is performed for "basic" data-parallel and split-K decompositions sk_tiles_ = 0; sk_units_ = 0; + divmod_sk_units_per_group_ = FastDivmodU64(blocks_m * blocks_n * blocks_l); } private: diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index 142bd3ad8f..b496ed33aa 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -322,7 +322,6 @@ template auto lock_idx = (tile_idx * num_barriers) + barrier_idx; auto reduction_tile_idx = tile_idx; - auto [first_peer_id, my_peer_id, last_peer_id] = tile_peer_range(params, tile_idx, static_cast(work_tile_info.K_idx)); auto reduction_peer_offset = 0; int barrier_group_thread_idx = ThreadIdxX(); @@ -386,7 +385,7 @@ template BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx); } else { - // Wait unitl the first split has stored its accumulators + // Wait until the first split has stored its accumulators BarrierManager::wait_lt(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, 1); } @@ -732,16 +731,18 @@ CUTLASS_DEVICE cta_per_grid_dim, params.divmod_blk_major_ ); - + if(params.divmod_splits_.divisor > 1) { + work_idx_l /= 
params.divmod_splits_.divisor; + } // Set the M, N, and L block offsets work_tile_info.M_idx = work_idx_m; work_tile_info.N_idx = work_idx_n; work_tile_info.L_idx = work_idx_l; - // if(linear_idx < 32 && ThreadIdxX() == 0) - // printf("BlockID: %lu | k_tile_count: %d | M_idx: %lu | N_idx: %lu | K_idx: %lu | L_idx: %lu | ctas_per_grid_dim: %lu | output_tile_id: %lu\n", + // if(ThreadIdxX() == 0) + // printf("BlockID: %lu | k_tile_count: %d | M_idx: %lu | N_idx: %lu | K_idx: %lu | L_idx: %lu | ctas_per_grid_dim: %lu | output_tile_id: %lu | unit_iter_start: %d | unit_iter_end: %d | tile_start: %d | tile_end: %d | split: %lu\n", // BlockIdxY(), work_tile_info.k_tile_count, work_tile_info.M_idx, work_tile_info.N_idx, work_tile_info.K_idx, - // work_tile_info.L_idx, remainder, output_tile_id); + // work_tile_info.L_idx, remainder, output_tile_id, unit_iter_start, unit_iter_end, tile_iter_start, tile_iter_end, split); } // Returns the starting and ending peer ID of this tile From 59b884f11cb004b80281e2ccd420ad05c808f6e5 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 2 Sep 2024 13:43:15 +0100 Subject: [PATCH 09/30] Minor cleanup * Need to fix splitK for batch > 1 --- .../gemm/kernel/intel_pvc_gemm_streamk.hpp | 47 +------ ...rsistent_tile_scheduler_params_streamk.hpp | 23 +--- .../intel_pvc_tile_scheduler_streamk.hpp | 124 ++---------------- 3 files changed, 23 insertions(+), 171 deletions(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp index 0f597a92cf..3f9424b0cb 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp @@ -91,8 +91,6 @@ class GemmUniversal< using EpilogueArguments = typename CollectiveEpilogue::Arguments; using EpilogueParams = typename CollectiveEpilogue::Params; -// static_assert(ArchTag::kMinComputeCapability >= 90); - using TileSchedulerTag = TileScheduler_; using TileScheduler = typename detail::TileSchedulerSelector< TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler; @@ -109,13 +107,6 @@ class GemmUniversal< static constexpr int VecC = CollectiveMainloop::VecC; - /// Register requirement for Load and Math WGs -// static constexpr uint32_t LoadRegisterRequirement = 40; -// static constexpr uint32_t MmaRegisterRequirement = 232; - - // 1 stage ordered sequence between mainloop and epilogue producer load threads -// using LoadWarpOrderBarrier = cutlass::OrderedSequenceBarrier<1,2>; - // Kernel level shared memory storage struct SharedStorage { using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage; @@ -173,28 +164,15 @@ class GemmUniversal< // Calculate workspace pointers uint8_t* workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; - - void* scheduler_workspace = workspace_ptr; - workspace_offset += TileScheduler::template get_workspace_size( - args.scheduler, args.problem_shape, args.hw_info); - - void* epilogue_workspace = workspace_ptr + workspace_offset; - workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue); - void* mainloop_workspace = nullptr; - // Precompute the sub tiles numbers in epilogue, pass into tile scheduler. Therefore it will be used - // in separate reduction scheme for streamk case, NumEpilogueSubTiles default value is 1, which means - // subtile will not be used, therefore separate reduction will not be enabled. 
- // constexpr uint32_t NumEpilogueSubTiles = CollectiveEpilogue::get_store_pipe_increment(TileShape{}); TileSchedulerParams scheduler = TileScheduler::to_underlying_arguments( - problem_shape_MNKL, TileShape{}, hw_info, args.scheduler, scheduler_workspace); + problem_shape_MNKL, TileShape{}, hw_info, args.scheduler, workspace_ptr); return { args.mode, problem_shape, - CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, mainloop_workspace), - CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, epilogue_workspace), + CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace_ptr), + CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace_ptr), hw_info, scheduler, workspace @@ -211,10 +189,8 @@ class GemmUniversal< static size_t get_workspace_size(Arguments const& args) { size_t workspace_size = 0; - workspace_size += TileScheduler::template get_workspace_size( args.scheduler, args.problem_shape, args.hw_info); - workspace_size += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue); return workspace_size; } @@ -223,21 +199,9 @@ class GemmUniversal< CudaHostAdapter* cuda_adapter = nullptr) { Status status = Status::kSuccess; uint8_t* workspace_ptr = reinterpret_cast(workspace); - size_t workspace_offset = 0; status = TileScheduler::template initialize_workspace( - args.scheduler, workspace_ptr + workspace_offset, args.problem_shape, args.hw_info); - workspace_offset += TileScheduler::template get_workspace_size( - args.scheduler, args.problem_shape, args.hw_info); - if (status != Status::kSuccess) { - return status; - } - - status = CollectiveEpilogue::initialize_workspace(args.problem_shape, args.epilogue, workspace_ptr + workspace_offset, stream, cuda_adapter); - workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue); - if (status != Status::kSuccess) { - return status; - } + args.scheduler, workspace_ptr, args.problem_shape, args.hw_info); return status; } @@ -246,7 +210,6 @@ class GemmUniversal< static dim3 get_grid_shape(Params const& params) { // Given device SM count, set grid size s.t. 
we do not launch more thread blocks than we can run concurrently - // TileSchedulerArguments args{}; return TileScheduler::get_grid_shape(params.problem_shape, TileShape{}, params.hw_info); } @@ -262,7 +225,6 @@ class GemmUniversal< using X = Underscore; // Preconditions - // static_assert(size(TiledMma{}) == 256, "Cooperative kernel must have TiledMMA operating using 256 threads."); static_assert(size<0>(TileShape{}) >= 128, "Cooperative kernel requires Tile Size to be greater than or equal to 128 along the M-dimension."); @@ -305,6 +267,7 @@ class GemmUniversal< const int m_offset = sub_group_id / CollectiveMainloop::sg_per_wg_n * get<0>(subgroup_shape); const int n_offset = sub_group_id % CollectiveMainloop::sg_per_wg_n * get<1>(subgroup_shape); + while (work_tile_info.is_valid()) { const int m_coord = work_tile_info.M_idx * get<0>(workgroup_shape) + m_offset; const int n_coord = work_tile_info.N_idx * get<1>(workgroup_shape) + n_offset; diff --git a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp index 5691059817..1201347fd8 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp @@ -143,18 +143,6 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // ktile start from even for each cta uint32_t ktile_start_alignment_count { 1u }; - // Returns the maximum number of peers that can collaborate on a given output tile - CUTLASS_HOST_DEVICE - static uint32_t - max_peers_per_tile(uint64_t sk_units, uint64_t sk_tiles) { - // When we can divide up our SK units to SK tiles evenly, the number of peers - // per SK tile is exactly (sk_units_ / sk_tiles_). In cases where this division - // is not exact, some tiles will need to be covered by additional SK units. Because - // the extra work can occur at both the beginning and the end of the SK tile, at - // most 2 extra peers will be needed. - return static_cast(sk_units / sk_tiles + 2); - } - // Initializes members. This variant of the method should only be used when // problem_shape and tile_shape contain modes of only rank 1. 
void @@ -397,14 +385,17 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { static dim3 get_grid_shape( dim3 problem_blocks, - KernelHardwareInfo hw_info + KernelHardwareInfo hw_info, + bool truncate_range = true ) { uint32_t available_sms = 1 << find_log2(hw_info.sm_count / 8); - // printf("available_sms: %d\n", available_sms); auto possibly_truncate = [&](int x, int y) { - return static_cast(platform::min(x, y)); + if(truncate_range) + return static_cast(platform::min(x, y)); + else + return static_cast(x); }; - return dim3{1, possibly_truncate(available_sms, problem_blocks.x * problem_blocks.y * problem_blocks.z), 1}; + return dim3{possibly_truncate(available_sms, problem_blocks.x * problem_blocks.y * problem_blocks.z), 1, 1}; } // Returns the number of stream-K tiles that will be computed amongst `output_tiles` total diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index b496ed33aa..bc2e896e82 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -79,8 +79,7 @@ class PersistentTileSchedulerIntelPVCStreamK { CUTLASS_HOST_DEVICE bool is_valid() const { - // A work tile that computes no K tiles is invalid unless it is a separate-reduction work tile - // (which only performs reduction and epilogue) + // A work tile that computes no K tiles is invalid return k_tile_count > 0; } @@ -132,9 +131,9 @@ class PersistentTileSchedulerIntelPVCStreamK { // The splitting factor to be used in a split-K decomposition of the problem. // If this is set to a value greater than 1, stream-K decomposition logic // is bypassed in favor of a split-K decomposition. - int splits = 1; + int splits = 16; ReductionMode reduction_mode = ReductionMode::Deterministic; - DecompositionMode decomposition_mode = DecompositionMode::StreamK; + DecompositionMode decomposition_mode = DecompositionMode::SplitK; }; // Sink scheduler params as a member @@ -185,8 +184,7 @@ class PersistentTileSchedulerIntelPVCStreamK { CUTLASS_HOST_DEVICE PersistentTileSchedulerIntelPVCStreamK(Params const& params_) : scheduler_params(params_) { - // current_work_linear_idx_ = uint64_t(BlockIdxX()) + uint64_t(BlockIdxY()) * uint64_t(GridDimX()); - current_work_linear_idx_ = uint64_t(BlockIdxY()); + current_work_linear_idx_ = uint64_t(BlockIdxX()); } CUTLASS_DEVICE @@ -403,7 +401,6 @@ template // `is_final_split` will be set to `true` for the following scenarios, all of which must compute the epilogue: // 1. The tile is computed in data-parallel mode // 2. The tile is computed in split-/stream-K mode and this work unit represents the final split of the tile - // 3. 
The tile is computed in split-/stream-K mode and separate reduction is used, and this is a separate reduction unit return work_tile_info.is_valid() && work_tile_info.is_final_split(params.divmod_tiles_per_output_tile_.divisor); } @@ -519,19 +516,7 @@ CUTLASS_DEVICE WorkTileInfo& work_tile_info) { uint64_t output_tile_id = linear_idx; - if (linear_idx >= params.units_per_problem_ * params.divmod_splits_.divisor) { - // Separate-reduction work - /*auto cluster_size = params.get_cluster_size(); - // Divide up the linearized separate reduction units into clusters - auto cluster_linear_reduction_unit_idx = params.div_cluster_size((linear_idx - params.units_per_problem_)); - uint64_t cluster_tile_idx, epi_subtile_idx; - params.divmod_epilogue_subtile_(cluster_tile_idx, epi_subtile_idx, cluster_linear_reduction_unit_idx); - // Bring the linearized tile ID back into the space of tiles, rather than clusters - output_tile_id = cluster_tile_idx * cluster_size; - - work_tile_info.setup_separate_reduction(epi_subtile_idx);*/ - } - else if (linear_idx >= params.sk_units_ && params.divmod_splits_.divisor == 1) { + if (linear_idx >= params.sk_units_ && params.divmod_splits_.divisor == 1) { // Data-parallel work output_tile_id = linear_idx - params.sk_units_ + params.sk_tiles_; work_tile_info.K_idx = 0; @@ -539,38 +524,10 @@ CUTLASS_DEVICE work_tile_info.k_tile_remaining = params.divmod_tiles_per_output_tile_.divisor; } else { - // In the CUTLASS 2.x implementation of stream K, stream-K work is assigned to each stream-K - // threadblock individually. For the most part, the set of K iterations corresponding to stream-K - // work was divided amongst stream-K threadblocks, and a threadblock determined which tile - // it would compute a (potentially-partial) output tile for based on the space of k iterations - // assigned to it. This often results in stream-K threadblocks processing tiles with different - // offsets in the K dimension from one another. This can reduce locality, but is lmitied to the - // (generally few) waves of threadblocks assigned to compute stream-K work. - // - // With the introduction of threadblock clusters, there is additional benefit to maintaining - // locality in the K dimension: shared portions of operands can be multicasted to threadblocks - // within a cluster. Thus, we would like to ensure that the assignment of stream-K work to - // threadblocks respects the ability to perform multicasting. - // - // To do so, we divide up the linearized stream-K units into clusters and share the same K - // offsets for work within clusters. - - // auto cluster_linear_work_idx = params.div_cluster_size(linear_idx); - - // uint64_t group_idx; - // params.divmod_sk_groups_(cluster_linear_work_idx, group_idx, cluster_linear_work_idx); - - // // Determine whether we are in a "big group" that will process an additional - // // stream-K cluster tile. - // auto sk_cluster_tiles = params.div_cluster_size(params.sk_tiles_); - // auto sk_tiles_in_group = params.divmod_sk_groups_.divide(params.sk_tiles_); - // if (group_idx < params.big_groups_) { - // ++sk_cluster_tiles_in_group; - // } - // // Determine whether we are in a "big unit" within the group, that will process - // // an additional K chunk in the group. - auto sk_tiles_in_group = params.sk_tiles_;//sk_cluster_tiles_in_group * params.get_cluster_size(); + // Determine whether we are in a "big unit" within the group, that will process + // an additional K chunk in the group. 
+ auto sk_tiles_in_group = params.sk_tiles_; auto k_tiles_in_group = sk_tiles_in_group * params.divmod_tiles_per_output_tile_.divisor; auto k_tiles_per_unit_in_group = params.divmod_sk_units_per_group_.divide(k_tiles_in_group); // auto big_units_in_group = params.div_cluster_size( @@ -696,17 +653,6 @@ CUTLASS_DEVICE // overall linearized space. output_tile_id = output_tile_id_in_group * params.divmod_sk_groups_.divisor; - // Bring the linearized tile ID back into the space of tiles, rather than clusters - // output_tile_id *= params.get_cluster_size(); - - // The final linearized tile ID is in units of the cluster dimension over which we rasterize. - // if (params.raster_order_ == RasterOrder::AlongN) { - // output_tile_id += cta_n_in_cluster * params.divmod_cluster_shape_minor_.divisor; - // } - // else { - // output_tile_id += cta_m_in_cluster * params.divmod_cluster_shape_minor_.divisor; - // } - // The unit's starting k iteration in the current tile is either the starting // iteration for the tile as a whole, or the starting k iteration for the unit // as a whole (if the latter is greater than the former). @@ -740,59 +686,11 @@ CUTLASS_DEVICE work_tile_info.L_idx = work_idx_l; // if(ThreadIdxX() == 0) - // printf("BlockID: %lu | k_tile_count: %d | M_idx: %lu | N_idx: %lu | K_idx: %lu | L_idx: %lu | ctas_per_grid_dim: %lu | output_tile_id: %lu | unit_iter_start: %d | unit_iter_end: %d | tile_start: %d | tile_end: %d | split: %lu\n", - // BlockIdxY(), work_tile_info.k_tile_count, work_tile_info.M_idx, work_tile_info.N_idx, work_tile_info.K_idx, - // work_tile_info.L_idx, remainder, output_tile_id, unit_iter_start, unit_iter_end, tile_iter_start, tile_iter_end, split); + // printf("BlockID: %lu | k_tile_count: %d | M_idx: %lu | N_idx: %lu | K_idx: %lu | L_idx: %lu | ctas_per_grid_dim: %lu | output_tile_id: %lu\n", + // BlockIdxX(), work_tile_info.k_tile_count, work_tile_info.M_idx, work_tile_info.N_idx, work_tile_info.K_idx, + // work_tile_info.L_idx, remainder, output_tile_id); } - // Returns the starting and ending peer ID of this tile - CUTLASS_HOST_DEVICE - static auto - tile_peer_range(Params const& params, uint32_t tile_idx, uint32_t cur_k_tile) { - auto tile_idx_in_cluster_path = tile_idx; - auto start_k_tile = params.divmod_tiles_per_output_tile_.divisor * tile_idx_in_cluster_path; - auto end_k_tile = start_k_tile + params.divmod_tiles_per_output_tile_.divisor - 1; - auto big_unit_k_tiles = params.big_units_ * (params.divmod_k_tiles_per_sk_unit_.divisor + 1); - - auto adjust_unit = [&](uint32_t k_tile, uint32_t unit_idx, uint32_t k_tiles_per_unit) { - auto unit_k_start = unit_idx * k_tiles_per_unit; - auto unit_k_end = unit_k_start + k_tiles_per_unit; - if (k_tile - start_k_tile < Params::min_iters_per_sk_unit_ && - unit_k_end - start_k_tile < Params::min_iters_per_sk_unit_) { - // k_tile is within the first min_iters_per_sk_unit_ K tiles of this output tile, - // and the stream-K unit computes fewer than min_iters_per_sk_unit_ K tiles for this - // output tile. This work will thus be subsumed by the next stream-K unit. - ++unit_idx; - } - - if (end_k_tile + 1 - k_tile < Params::min_iters_per_sk_unit_ && - end_k_tile + 1 - unit_k_start < Params::min_iters_per_sk_unit_) { - // k_tile is within the last min_iters_per_sk_unit_ K tiles of this output tile, - // and the stream-K unit computes fewer than min_iters_per_sk_unit_ K tiles for this - // output tile. This work will thus be subsumed by the previous stream-K unit. 
- --unit_idx; - } - - return unit_idx; - }; - - // Lambda to find the ID of the stream-K unit that computes this K tile - auto find_unit = [&](uint32_t k_tile) { - if (k_tile < big_unit_k_tiles) { - // The tile is within the "big unit range" - auto unit_idx = params.divmod_k_tiles_per_sk_big_unit_.divide(k_tile); - return static_cast(adjust_unit(k_tile, unit_idx, params.divmod_k_tiles_per_sk_big_unit_.divisor)); - } - else { - // The tile is after the "big unit range." Account for this by finding the "normal unit" - // that it belongs to, and then offsetting by the number of big units - auto unit_idx = params.divmod_k_tiles_per_sk_unit_.divide(k_tile - big_unit_k_tiles) + params.big_units_; - return static_cast(adjust_unit(k_tile, unit_idx, params.divmod_k_tiles_per_sk_unit_.divisor)); - } - }; - - return cute::make_tuple(find_unit(start_k_tile), find_unit(cur_k_tile), find_unit(end_k_tile)); - } }; } // namespace cutlass::gemm::kernel::detail From 4e9f9c3e7888adba9b731bea9d242020cd41cb80 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 2 Sep 2024 15:46:32 +0100 Subject: [PATCH 10/30] Fixed splitK for batch > 1 --- examples/sycl/pvc/pvc_gemm.cpp | 5 ++++- .../kernel/intel_pvc_tile_scheduler_streamk.hpp | 13 ++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/examples/sycl/pvc/pvc_gemm.cpp b/examples/sycl/pvc/pvc_gemm.cpp index 3cd69255eb..9224c2c071 100644 --- a/examples/sycl/pvc/pvc_gemm.cpp +++ b/examples/sycl/pvc/pvc_gemm.cpp @@ -47,6 +47,7 @@ #include "cutlass/util/reference/device/tensor_compare.h" #include "common.h" +#include "cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp" using namespace cute; /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -219,7 +220,9 @@ struct ExampleRunner { problem_size, {block_A.get(), stride_A, block_B.get(), stride_B}, {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, - hw_info + hw_info, + {1, + cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::StreamK} }; Gemm gemm_op; diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index bc2e896e82..8d33a76c8c 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -131,9 +131,9 @@ class PersistentTileSchedulerIntelPVCStreamK { // The splitting factor to be used in a split-K decomposition of the problem. // If this is set to a value greater than 1, stream-K decomposition logic // is bypassed in favor of a split-K decomposition. 
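    // [Editorial note, not part of the patch] Example with made-up numbers:
    // splits = 4 and k_tiles_per_output_tile = 30 gives 30 / 4 = 7 K tiles per
    // split, with big_units_ = 30 % 4 = 2 splits taking one extra K tile each
    // (8 + 8 + 7 + 7 = 30).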
- int splits = 16; + int splits = 1; ReductionMode reduction_mode = ReductionMode::Deterministic; - DecompositionMode decomposition_mode = DecompositionMode::SplitK; + DecompositionMode decomposition_mode = DecompositionMode::Heuristic; }; // Sink scheduler params as a member @@ -669,6 +669,11 @@ CUTLASS_DEVICE } uint64_t work_idx_l, remainder; + + if(params.divmod_splits_.divisor > 1) { + output_tile_id %= params.units_per_problem_; + } + params.divmod_batch_(work_idx_l, remainder, output_tile_id); uint64_t cta_per_grid_dim = remainder; //params.divmod_cluster_shape_minor_.divide(remainder); @@ -677,9 +682,7 @@ CUTLASS_DEVICE cta_per_grid_dim, params.divmod_blk_major_ ); - if(params.divmod_splits_.divisor > 1) { - work_idx_l /= params.divmod_splits_.divisor; - } + // Set the M, N, and L block offsets work_tile_info.M_idx = work_idx_m; work_tile_info.N_idx = work_idx_n; From 345dcaeb0c9b39bdcced6d6b7294cd9fe9377736 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 2 Sep 2024 17:10:12 +0100 Subject: [PATCH 11/30] Re-enabled GEMM Universal Adater specialization --- include/cutlass/gemm/device/gemm_universal_adapter.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h index e1bf6ee2d6..40a21b1078 100644 --- a/include/cutlass/gemm/device/gemm_universal_adapter.h +++ b/include/cutlass/gemm/device/gemm_universal_adapter.h @@ -91,8 +91,7 @@ class GemmUniversalAdapter; template class GemmUniversalAdapter< GemmKernel_, - // cute::enable_if_t::value>> - cute::enable_if_t> + cute::enable_if_t::value>> { public: using GemmKernel = GemmKernel_; @@ -506,7 +505,7 @@ class GemmUniversalAdapter< ////////////////////////////// CUTLASS 2.x API ///////////////////////////////// //////////////////////////////////////////////////////////////////////////////// -/*template +template class GemmUniversalAdapter< GemmKernel_, cute::enable_if_t::value>> @@ -667,7 +666,7 @@ class GemmUniversalAdapter< return status; } -};*/ +}; //////////////////////////////////////////////////////////////////////////////// From 05b487aedd20664653aa3c0fe51b5d1f95f94b88 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 2 Sep 2024 17:14:45 +0100 Subject: [PATCH 12/30] Update split barrier arguments --- include/cutlass/arch/barrier.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/cutlass/arch/barrier.h b/include/cutlass/arch/barrier.h index 0ac6da8813..756f21971d 100644 --- a/include/cutlass/arch/barrier.h +++ b/include/cutlass/arch/barrier.h @@ -156,8 +156,8 @@ class NamedBarrier { CUTLASS_DEVICE static void arrive_and_wait_internal(uint32_t num_threads, uint32_t barrier_id) { #if defined SYCL_INTEL_TARGET - __spirv_ControlBarrierArriveINTEL(2, 2, 0x4); - __spirv_ControlBarrierWaitINTEL(2, 2, 0x2); + __spirv_ControlBarrierArriveINTEL(2, 2, 0x0); + __spirv_ControlBarrierWaitINTEL(2, 2, 0x0); #elif defined CUDA_BARRIER_ENABLED asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads)); #elif defined(__CUDA_ARCH__) @@ -168,7 +168,7 @@ class NamedBarrier { CUTLASS_DEVICE static void arrive_internal(uint32_t num_threads, uint32_t barrier_id) { #if defined SYCL_INTEL_TARGET - __spirv_ControlBarrierArriveINTEL(2, 2, 0x4); + __spirv_ControlBarrierArriveINTEL(2, 2, 0x0); #elif CUDA_BARRIER_ENABLED asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads)); #elif defined(__CUDA_ARCH__) From bff4801c33f00b6a36668f2d7f042826abd1a4ee Mon Sep 17 
00:00:00 2001 From: Muhammad Tanvir Date: Mon, 2 Sep 2024 17:28:10 +0100 Subject: [PATCH 13/30] Minor cleanup --- .../gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index 8d33a76c8c..578eaa09cf 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -339,7 +339,7 @@ template // The number of tiles for which reduction is required is either: // (a) the total number of output tiles (in the case of split-K) - // (b) the number of stream-K tiles (potentially multiplied by peer count if using separate reduction) + // (b) the number of stream-K tiles // To calculate the total number of output tiles in the split-K case, we // note that, in the split-K case, the units_per_problem_ member of Params will be // the total number of output tiles. @@ -358,8 +358,7 @@ template if (!compute_epilogue(work_tile_info, params)) { if (work_tile_info.K_idx == 0) { - // The first peer initializes the workspace partials in the non-separate-reduction case, - // and all peers write to their own location in workspace when using separate reduction + // The first peer initializes the workspace partials BlockStripedReduceT::store(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx); } else { @@ -370,8 +369,8 @@ template BlockStripedReduceT::reduce(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx); } - // If separate reduction is being performed, each participating stream-K unit increments the barrier - // by only 1. Otherwise, increment by the K tile count that this unit has processed. + // Each participating stream-K unit increments the barrier by the K tile count that this unit has + // processed. 
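      // [Editorial worked example, not part of the patch] Suppose an output tile has
      // 16 K tiles covered by unit A (K_idx 0, k_tile_count 8) and unit B (K_idx 8,
      // k_tile_count 8), with B the final split. A stores its partials to the
      // reduction workspace and bumps the tile's flag by 8; B waits until the flag
      // equals its K_idx (8), folds the workspace partials into its accumulators,
      // and then runs the epilogue instead of writing back to the workspace.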
int32_t increment = work_tile_info.k_tile_count; // Signal our arrival From 3300bf737bf737b98ddb446f4c8f713c0b1e7025 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Tue, 3 Sep 2024 11:14:23 +0100 Subject: [PATCH 14/30] Changed initialization to workspace only --- examples/sycl/pvc/pvc_gemm.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/sycl/pvc/pvc_gemm.cpp b/examples/sycl/pvc/pvc_gemm.cpp index 9224c2c071..18e2b48c10 100644 --- a/examples/sycl/pvc/pvc_gemm.cpp +++ b/examples/sycl/pvc/pvc_gemm.cpp @@ -221,8 +221,8 @@ struct ExampleRunner { {block_A.get(), stride_A, block_B.get(), stride_B}, {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, hw_info, - {1, - cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::StreamK} + {16, + cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::SplitK} }; Gemm gemm_op; @@ -248,7 +248,7 @@ struct ExampleRunner { timer.start(); for (int i = 0; i < options.iterations; ++i) { if(workspace_size > 0) - gemm_op.initialize(arguments, workspace.get()); + Gemm::GemmKernel::initialize_workspace(arguments, workspace.get()); gemm_op.run(); } syclcompat::wait(); From f559e31558255128b5a546533a6816913813bef9 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Tue, 3 Sep 2024 14:05:49 +0100 Subject: [PATCH 15/30] Fix CI failure --- include/cutlass/barrier.h | 4 ++-- include/cutlass/gemm/kernel/tile_scheduler.hpp | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/cutlass/barrier.h b/include/cutlass/barrier.h index d5dfaf4a63..1cfc73c1cb 100644 --- a/include/cutlass/barrier.h +++ b/include/cutlass/barrier.h @@ -97,12 +97,12 @@ struct GenericBarrier { { int state = 0; -#ifdef SYCL_INTEL_TARGET +#if defined (SYCL_INTEL_TARGET) auto atm = sycl::atomic_ref(*ptr); return atm.load(sycl::memory_order::acquire); -#elif defined (__CUDA_ARCH__ >= 700) +#elif (__CUDA_ARCH__ >= 700) /// SM70 and newer use memory consistency qualifiers // Acquire pattern using acquire modifier diff --git a/include/cutlass/gemm/kernel/tile_scheduler.hpp b/include/cutlass/gemm/kernel/tile_scheduler.hpp index e354e7a48c..524884ed6f 100644 --- a/include/cutlass/gemm/kernel/tile_scheduler.hpp +++ b/include/cutlass/gemm/kernel/tile_scheduler.hpp @@ -130,6 +130,7 @@ struct TileSchedulerSelector< using Scheduler = PersistentTileSchedulerSm90StreamK; }; +#if defined (SYCL_INTEL_TARGET) template < class TileShape, class ClusterShape @@ -142,6 +143,7 @@ struct TileSchedulerSelector< > { using Scheduler = PersistentTileSchedulerIntelPVCStreamK; }; +#endif template < class TileShape, From bcf812e7bf3be9af957e8b972219921b47dff742 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 4 Sep 2024 12:04:25 +0100 Subject: [PATCH 16/30] Added support for scheduling non-uniform tiles --- .../intel_pvc_tile_scheduler_streamk.hpp | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index 578eaa09cf..554724c39b 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -529,15 +529,14 @@ CUTLASS_DEVICE auto sk_tiles_in_group = params.sk_tiles_; auto k_tiles_in_group = sk_tiles_in_group * params.divmod_tiles_per_output_tile_.divisor; auto k_tiles_per_unit_in_group = 
params.divmod_sk_units_per_group_.divide(k_tiles_in_group); - // auto big_units_in_group = params.div_cluster_size( - // k_tiles_in_group - (k_tiles_per_unit_in_group * params.divmod_sk_units_per_group_.divisor)); + auto big_units_in_group = k_tiles_in_group - (k_tiles_per_unit_in_group * params.divmod_sk_units_per_group_.divisor); uint64_t split; params.divmod_sk_units_per_group_(split, output_tile_id, output_tile_id); bool is_split_k = params.divmod_splits_.divisor > 1; - auto big_unit_cmp_lhs = output_tile_id; - auto big_unit_cmp_rhs = /*is_split_k ? */params.big_units_;// : big_units_in_group; + auto big_unit_cmp_lhs = is_split_k ? split : output_tile_id; + auto big_unit_cmp_rhs = is_split_k ? params.big_units_ : big_units_in_group; auto linear_idx_mult = is_split_k ? params.divmod_tiles_per_output_tile_.divisor : k_tiles_per_unit_in_group; auto k_tiles_per_split = is_split_k ? params.divmod_k_tiles_per_sk_unit_.divisor : k_tiles_per_unit_in_group; @@ -548,18 +547,18 @@ CUTLASS_DEVICE // compute one extra iteration. If there are any big units, they will be the first // in the linearized ID space. auto k_tiles_in_my_split = k_tiles_per_split; - // if (big_unit_cmp_lhs < big_unit_cmp_rhs) { - // // Since the "big units" are the first units in the linearized ID space, each - // // of the units preceding this big unit computed one extra iteration. Thus, - // // we must offset our start iteration by the number of units that precede - // // the current unit in the linearized ID space. - // unit_iter_start += big_unit_cmp_lhs; - // ++k_tiles_in_my_split; - // } - // else { - // // Increment by one for each of the big clusters (since all big units precede this unit) - // unit_iter_start += big_unit_cmp_rhs; - // } + if (big_unit_cmp_lhs < big_unit_cmp_rhs) { + // Since the "big units" are the first units in the linearized ID space, each + // of the units preceding this big unit computed one extra iteration. Thus, + // we must offset our start iteration by the number of units that precede + // the current unit in the linearized ID space. 
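        // [Editorial worked example, not part of the patch] With made-up numbers:
        // 120 K tiles over 16 stream-K units gives k_tiles_per_unit_in_group = 7 and
        // big_units_in_group = 120 - 7 * 16 = 8, so units 0..7 compute 8 K tiles and
        // units 8..15 compute 7. A big unit with index u therefore starts at
        // u * 7 + u, which is the `unit_iter_start += big_unit_cmp_lhs` adjustment
        // just below; non-big units instead skip past all 8 big units.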
+ unit_iter_start += big_unit_cmp_lhs; + ++k_tiles_in_my_split; + } + else { + // Increment by one for each of the big clusters (since all big units precede this unit) + unit_iter_start += big_unit_cmp_rhs; + } if (!is_split_k) { // Adjust the unit starting position and number of tiles to avoid From 1544d51af38389cb5a810c3f6acecb5b69c05f52 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 4 Sep 2024 12:09:00 +0100 Subject: [PATCH 17/30] Only include split barrier flags for PVC --- cmake/FindDPCPP.cmake | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake index a18eda051c..ba60ed2f8b 100644 --- a/cmake/FindDPCPP.cmake +++ b/cmake/FindDPCPP.cmake @@ -43,7 +43,6 @@ set(DPCPP_COMPILE_ONLY_FLAGS "") if(NOT "${DPCPP_SYCL_TARGET}" STREQUAL "") list(APPEND DPCPP_FLAGS "-fsycl-targets=${DPCPP_SYCL_TARGET};") - list(APPEND DPCPP_FLAGS "-Xspirv-translator;--spirv-ext=+SPV_INTEL_split_barrier;-DSPV_INTEL_split_barrier;") endif() if(NOT "${DPCPP_USER_FLAGS}" STREQUAL "") @@ -58,6 +57,10 @@ if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "") endif() endif() +if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc") + list(APPEND DPCPP_FLAGS "-Xspirv-translator;--spirv-ext=+SPV_INTEL_split_barrier;-DSPV_INTEL_split_barrier;") +endif() + if(UNIX) set_target_properties(DPCPP::DPCPP PROPERTIES INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS};${DPCPP_COMPILE_ONLY_FLAGS}" From 2ab1cd81c9be5c8de4ed9152edc2f1609bd69918 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 4 Sep 2024 12:44:28 +0100 Subject: [PATCH 18/30] Test --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 64cf206671..b63296b2e4 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -401,6 +401,8 @@ endif() # Warnings-as-error exceptions and warning suppressions for Clang builds if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-command-line-argument ") + list(APPEND CUTLASS_CUDA_NVCC_FLAGS "-Wno-error=unused-command-line-argument") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=implicit-int-conversion ") list(APPEND CUTLASS_CUDA_NVCC_FLAGS "-Wno-error=implicit-int-conversion" ) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pass-failed ") From 750ee3a7d8694a3075a236581e61016aeb57b71b Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 4 Sep 2024 14:32:45 +0100 Subject: [PATCH 19/30] Code cleanup --- .../gemm/kernel/intel_pvc_gemm_streamk.hpp | 6 +- ...rsistent_tile_scheduler_params_streamk.hpp | 107 ++++++++---------- .../intel_pvc_tile_scheduler_streamk.hpp | 19 ++-- 3 files changed, 54 insertions(+), 78 deletions(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp index 3f9424b0cb..f699a8fd6b 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp @@ -221,12 +221,8 @@ class GemmUniversal< CUTLASS_DEVICE void operator()(Params const& params, char* smem_buf) { - using namespace cute; - using X = Underscore; - // Preconditions - static_assert(size<0>(TileShape{}) >= 128, - "Cooperative kernel requires Tile Size to be greater than or equal to 128 along the M-dimension."); + CUTE_STATIC_ASSERT(is_static::value); static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>."); static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. 
If batch mode is not needed, set L stride to Int<0>."); diff --git a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp index 1201347fd8..85fcfdbefb 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp @@ -51,18 +51,18 @@ namespace detail { // Parameters for Intel PVC persistent stream-K scheduler struct PersistentTileSchedulerIntelPVCStreamKParams { - // Strategies for computing reductions between CTAs computing portions of a given output tile + // Strategies for computing reductions between work-groups computing portions of a given output tile enum class ReductionMode { - // Participating CTAs perform reduction in a turnstile fashion in order of the K extent - // covered by each CTA. This requires a lock to be held exclusively be the CTA that is + // Participating work-groups perform reduction in a turnstile fashion in order of the K extent + // covered by each work-group. This requires a lock to be held exclusively be the work-group that is // currently accumulating. // // Turnstile accumulation ensures deterministic numeric behavior when using this mode. Deterministic, - // Participating CTAs perform reduction atomically to the same workspace (mostly) without locking. - // Locks are used only to wait for the first CTA to write its partial values (to initialize the - // workspace), and for all but the final CTA to have accumulated (so that the final CTA can load + // Participating work-groups perform reduction atomically to the same workspace (mostly) without locking. + // Locks are used only to wait for the first work-group to write its partial values (to initialize the + // workspace), and for all but the final work-group to have accumulated (so that the final work-group can load // the accumulated value and accumulate it into registers on top of which the epilogue will // be performed). // @@ -87,12 +87,8 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { FastDivmodU64 divmod_batch_{}; FastDivmodU64 divmod_blk_major_{}; - // We divide up the number of stream-K tiles amongst G groups of stream-K units. - // The stream-K units within a group collaborate to comptue over the `sk_tiles / G` - // tiles assigned to that group. Non-unit group sizes can help to preserve L2 locality of - // partial chunks computed by stream-K units -- units 0 in each group will compute identical K extents - // of tiles that would be assigned in the same wave according to the rasterization order of the - // data-parallel formulation of the problem. + // Divide up the number of stream-K tiles amongst G groups of stream-K units. + // Currently defaults to 1 since we don't create groups for PVC. FastDivmodU64 divmod_sk_groups_{}; // Number of stream-K units in each group @@ -108,11 +104,9 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // Number of stream-K or split-K work units that compute an extra k iteration. // This is done to handle residuals in dividing up the k iteration space. - // For stream-K, since the actual assignment of work to stream-K units will be done - // at the granularity of a cluster, we store only the number of big clusters. uint32_t big_units_ = 0; - // The number of groups of stream-K units that will process an extra stream-K tile cluster. + // The number of groups of stream-K units that will process an extra stream-K tile. 
uint32_t big_groups_ = 0; // Workspace for holding partial accumulators to be reduced across stream-K/split-K units @@ -131,7 +125,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // processes one more K chunk than a "normal" stream-K unit. FastDivmod divmod_k_tiles_per_sk_big_unit_{}; - // Strategy to use when reducing between collaborating CTAs + // Strategy to use when reducing between collaborating work-groups ReductionMode reduction_mode_ = ReductionMode::Deterministic; // Minimum number of k tiles that can be assigned to a stream-K unit @@ -156,7 +150,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { void* workspace ) { - dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, tile_shape); + dim3 problem_blocks = get_tiled_wg_shape_mnl(problem_shape, tile_shape); // Number of k tiles in each output tile uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k(); @@ -171,8 +165,8 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { ); } - // Version of initialize that takes in as input the number of CTAs in the M and N and L dimensions. - // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1, + // Version of initialize that takes in as input the number of work-groups in the M and N and L dimensions. + // This is useful for calculating the tiled shape when a mode of problem and/or work-group shape has rank > 1, // for which using CuTe algebra for calculating tile shapes is easiest. void initialize( @@ -232,18 +226,17 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { return; } - // Calculate the maximum number of blocks from clusters of shape cluster_shape that we - // can fit within sm_count SMs. + // Calculate the maximum number of blocks that we can fit within sm_count SMs. dim3 grid = get_grid_shape( problem_blocks, hw_info ); - uint64_t ctas_per_wave = grid.x * grid.y; + uint64_t wgs_per_wave = grid.x * grid.y; // The number of output tiles to be computed in stream-K and data-parallel fashion, respectively. uint32_t sk_tiles = get_num_sk_tiles( output_tiles, - ctas_per_wave, + wgs_per_wave, k_tiles_per_output_tile, decomposition_mode ); @@ -259,8 +252,8 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // is needed per data-parallel tile. uint64_t dp_units = dp_tiles; - uint64_t ctas_per_sk_wave = ctas_per_wave; - uint64_t sk_units = get_num_sk_units(ctas_per_sk_wave, sk_tiles, k_tiles_per_output_tile); + uint64_t wgs_per_sk_wave = wgs_per_wave; + uint64_t sk_units = get_num_sk_units(wgs_per_sk_wave, sk_tiles, k_tiles_per_output_tile); if (decomposition_mode == DecompositionMode::DataParallel || (decomposition_mode == DecompositionMode::Heuristic && sk_tiles == 0) || @@ -282,13 +275,9 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { auto sk_units_per_group = sk_units / groups; - // sk_tiles is guaranteed to be divisible by cluster_size because it is calculated as: - // sk_tiles = (waves <= 2) ? total_tiles : (sm_count + (total_tiles % sm_count)) - // Both total_tiles and sm_count are multiples of cluster size due to padding added - // prior to kernel launch. uint64_t sk_tiles_per_group = sk_tiles / groups; - // Groups that will process an extra stream-K tile cluster. These differ from "big_units," which + // Groups that will process an extra stream-K tile. These differ from "big_units," which // are stream-K units within a group that process an extra K chunk. 
uint64_t sk_big_groups = sk_tiles % groups; @@ -345,10 +334,10 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { cute::tuple get_work_idx_m_and_n( uint64_t blk_per_grid_dim, - FastDivmodU64 const& divmod_cluster_blk_major) { + FastDivmodU64 const& divmod_blk_major) { uint64_t m_idx, n_idx; - divmod_cluster_blk_major(m_idx, n_idx, blk_per_grid_dim); + divmod_blk_major(m_idx, n_idx, blk_per_grid_dim); auto i = static_cast(m_idx); auto j = static_cast(n_idx); @@ -362,15 +351,15 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { get_linear_idx_from_m_and_n( int32_t tile_m, int32_t tile_n, - FastDivmodU64 const& divmod_cluster_blk_major) { - return static_cast(tile_m * divmod_cluster_blk_major.divisor + tile_n); + FastDivmodU64 const& divmod_blk_major) { + return static_cast(tile_m * divmod_blk_major.divisor + tile_n); } - // Get the number of CTA tiles in this problem. This variant of the method should only be used when + // Get the number of work-group tiles in this problem. This variant of the method should only be used when // problem_shape and tile_shape contain modes of only rank 1. CUTLASS_HOST_DEVICE static dim3 - get_tiled_cta_shape_mnl(BatchedGemmCoord problem_shape, GemmCoord cta_shape) { + get_tiled_wg_shape_mnl(BatchedGemmCoord problem_shape, GemmCoord cta_shape) { auto cta_m = (problem_shape.m() + cta_shape.m() - 1) / cta_shape.m(); auto cta_n = (problem_shape.n() + cta_shape.n() - 1) / cta_shape.n(); @@ -399,16 +388,16 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { } // Returns the number of stream-K tiles that will be computed amongst `output_tiles` total - // output tiles on a device with `ctas_per_wave` CTAs in each wave. + // output tiles on a device with `wgs_per_wave` work-groups in each wave. static uint32_t get_num_sk_tiles( uint64_t output_tiles, - uint64_t ctas_per_wave, + uint64_t wgs_per_wave, uint32_t k_tiles_per_output_tile, DecompositionMode decomposition_mode ) { - uint32_t full_waves = static_cast(output_tiles / ctas_per_wave); - uint32_t total_waves = static_cast((output_tiles + ctas_per_wave - 1) / ctas_per_wave); + uint32_t full_waves = static_cast(output_tiles / wgs_per_wave); + uint32_t total_waves = static_cast((output_tiles + wgs_per_wave - 1) / wgs_per_wave); if (decomposition_mode == DecompositionMode::DataParallel || decomposition_mode == DecompositionMode::SplitK) { @@ -420,7 +409,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // that full_waves == total_waves - 1 in this case, the number of data-parallel // waves is simply full_waves-1 (unless full_waves == 0). uint32_t dp_waves = full_waves > 1 ? full_waves - 1 : 0; - uint64_t dp_tiles = dp_waves * ctas_per_wave; + uint64_t dp_tiles = dp_waves * wgs_per_wave; uint64_t sk_tiles = output_tiles - dp_tiles; if (decomposition_mode == DecompositionMode::Heuristic) { @@ -436,8 +425,8 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // Rudimentary heuristic: prefer data-parallel decomposition if we have more than // one wave and the tail wave is more than half full. This is subject to change. 
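      // [Editorial worked example, not part of the patch] With made-up numbers and
      // wgs_per_wave = 32: output_tiles = 100 gives full_waves = 3 and a tail of
      // 100 - 96 = 4 tiles; 2 * 4 < 32, so dp_waves = 2 (64 data-parallel tiles) and
      // the remaining 36 tiles are covered by stream-K. For output_tiles = 116 the
      // tail is 20, 2 * 20 >= 32, and the check below returns 0 stream-K tiles,
      // i.e. a purely data-parallel decomposition.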
- uint64_t tail_tiles = output_tiles - (full_waves * ctas_per_wave); - if (2 * tail_tiles >= ctas_per_wave) { + uint64_t tail_tiles = output_tiles - (full_waves * wgs_per_wave); + if (2 * tail_tiles >= wgs_per_wave) { return 0; } } @@ -447,7 +436,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { CUTLASS_HOST_DEVICE static uint64_t - get_num_sk_units(uint64_t ctas_per_sk_wave, uint32_t sk_tiles, uint32_t k_tiles_per_output_tile) { + get_num_sk_units(uint64_t wgs_per_sk_wave, uint32_t sk_tiles, uint32_t k_tiles_per_output_tile) { // If there are stream-K tiles to compute and a sufficiently large number of k iterations // across them, they will be covered by a single wave of persistent threadblocks. Thus, there // will be as many work units as there are threadblocks in a single wave. @@ -457,16 +446,16 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { // set of blocks. // Calculate the number of stream-K units that would be needed if each stream-K unit - // computed the minimum allowable k iterations. Truncate this to be in units of clusters. + // computed the minimum allowable k iterations. // Number of k iterations computed by the stream-K units as a whole uint64_t k_tiles_sk_total = k_tiles_per_output_tile * sk_tiles; // Calculate the number of stream-K units that would be needed if each stream-K unit - // computed the minimum allowable k iterations. Truncate this to be in units of clusters. + // computed the minimum allowable k iterations. uint64_t min_sized_sk_units = (k_tiles_sk_total / min_iters_per_sk_unit_); - uint64_t sk_units = platform::min(ctas_per_sk_wave, min_sized_sk_units); + uint64_t sk_units = platform::min(wgs_per_sk_wave, min_sized_sk_units); return sk_units; } @@ -528,23 +517,19 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { problem_blocks, new_hw_info ); - uint64_t ctas_per_wave = grid.x * grid.y; + uint64_t wgs_per_wave = grid.x * grid.y; uint32_t sk_tiles = get_num_sk_tiles( output_tiles, - ctas_per_wave, + wgs_per_wave, static_cast(k_tiles_per_output_tile), decomposition_mode ); - uint64_t ctas_per_sk_wave = ctas_per_wave; - uint64_t sk_units = get_num_sk_units(ctas_per_sk_wave, sk_tiles, k_tiles_per_output_tile); + uint64_t wgs_per_sk_wave = wgs_per_wave; + uint64_t sk_units = get_num_sk_units(wgs_per_sk_wave, sk_tiles, k_tiles_per_output_tile); uint64_t dp_tiles = output_tiles - sk_tiles; uint64_t reduction_tiles = sk_tiles; - // Though separate reduction requires a larger reduction workspace, only one barrier - // is needed per output tile. Each peer will increment the barrier by one once the peer has - // written its accumulator to scratch space. The separate reduction unit will only begin - // performing the reduction when the barrier has reached the number of peers for the output tile. 
barrier_workspace_size = get_barrier_workspace_size(sk_tiles, barrier_bits); reduction_workspace_size = get_reduction_workspace_size(reduction_tiles, tile_shape, accumulator_bits); } @@ -562,7 +547,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { uint32_t barrier_bits, uint32_t element_accumulator_bits) { - dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, tile_shape); + dim3 problem_blocks = get_tiled_wg_shape_mnl(problem_shape, tile_shape); uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k(); return get_workspace_size( @@ -577,8 +562,8 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { ); } - // Version of get_workspace_size that takes in as input the number of CTAs in the M and N dimensions. - // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1, + // Version of get_workspace_size that takes in as input the number of work-groups in the M and N dimensions. + // This is useful for calculating the tiled shape when a mode of problem and/or work-group shape has rank > 1, // for which using CuTe algebra for calculating tile shapes is easiest. static size_t get_workspace_size( @@ -623,7 +608,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { uint32_t barrier_bits, uint32_t element_accumulator_bits) { - dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, tile_shape); + dim3 problem_blocks = get_tiled_wg_shape_mnl(problem_shape, tile_shape); uint32_t k_tiles_per_output_tile = (problem_shape.k() + tile_shape.k() - 1) / tile_shape.k(); return initialize_workspace( @@ -639,8 +624,8 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { ); } - // Version of initialize_workspace that takes in as input the number of CTAs in the M and N dimensions. - // This is useful for calculating the tiled shape when a mode of problem and/or CTA shape has rank > 1, + // Version of initialize_workspace that takes in as input the number of work-groups in the M and N dimensions. + // This is useful for calculating the tiled shape when a mode of problem and/or work-group shape has rank > 1, // for which using CuTe algebra for calculating tile shapes is easiest. 
static cutlass::Status initialize_workspace( diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp index 554724c39b..69ad69a374 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp @@ -155,7 +155,7 @@ class PersistentTileSchedulerIntelPVCStreamK { static_assert(cute::is_static::value); auto problem_shape_mnkl = cute::append<4>(problem_shape, cute::Int<1>{}); - dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + dim3 problem_blocks = get_tiled_wg_shape_mnl(problem_shape_mnkl, tile_shape); uint32_t k_tile_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(problem_shape_mnkl), cute::shape<2>(TileShape{}))); Params params; @@ -250,8 +250,8 @@ class PersistentTileSchedulerIntelPVCStreamK { template CUTLASS_HOST_DEVICE static dim3 - get_tiled_cta_shape_mnl(ProblemShape problem_shape_mnkl, TileShape cta_shape) { - return Params::get_tiled_cta_shape_mnl(to_gemm_coord(problem_shape_mnkl), to_gemm_coord(cta_shape)); + get_tiled_wg_shape_mnl(ProblemShape problem_shape_mnkl, TileShape cta_shape) { + return Params::get_tiled_wg_shape_mnl(to_gemm_coord(problem_shape_mnkl), to_gemm_coord(cta_shape)); } // Given the cluster shape, computes the physical grid we should launch. @@ -264,7 +264,7 @@ class PersistentTileSchedulerIntelPVCStreamK { KernelHardwareInfo hw_info) { auto problem_shape_mnkl = cute::append<4>(problem_shape, cute::Int<1>{}); - dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + dim3 problem_blocks = get_tiled_wg_shape_mnl(problem_shape_mnkl, tile_shape); return Params::get_grid_shape( problem_blocks, @@ -428,7 +428,7 @@ template TileShape tile_shape; - dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + dim3 problem_blocks = get_tiled_wg_shape_mnl(problem_shape_mnkl, tile_shape); uint32_t k_tile_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(problem_shape_mnkl), cute::shape<2>(TileShape{}))); return Params::get_workspace_size( @@ -455,7 +455,7 @@ template TileShape tile_shape; - dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape); + dim3 problem_blocks = get_tiled_wg_shape_mnl(problem_shape_mnkl, tile_shape); uint32_t k_tile_per_output_tile = cute::size(cute::ceil_div(cute::shape<2>(problem_shape_mnkl), cute::shape<2>(TileShape{}))); return Params::initialize_workspace( @@ -674,7 +674,7 @@ CUTLASS_DEVICE params.divmod_batch_(work_idx_l, remainder, output_tile_id); - uint64_t cta_per_grid_dim = remainder; //params.divmod_cluster_shape_minor_.divide(remainder); + uint64_t cta_per_grid_dim = remainder; auto [work_idx_m, work_idx_n] = Params::get_work_idx_m_and_n( cta_per_grid_dim, @@ -685,11 +685,6 @@ CUTLASS_DEVICE work_tile_info.M_idx = work_idx_m; work_tile_info.N_idx = work_idx_n; work_tile_info.L_idx = work_idx_l; - - // if(ThreadIdxX() == 0) - // printf("BlockID: %lu | k_tile_count: %d | M_idx: %lu | N_idx: %lu | K_idx: %lu | L_idx: %lu | ctas_per_grid_dim: %lu | output_tile_id: %lu\n", - // BlockIdxX(), work_tile_info.k_tile_count, work_tile_info.M_idx, work_tile_info.N_idx, work_tile_info.K_idx, - // work_tile_info.L_idx, remainder, output_tile_id); } }; From 8924208b6fbf38e9bdc6cfff8fd87c9632514bc7 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 4 Sep 2024 14:50:33 +0100 Subject: [PATCH 20/30] Add separate example for StreamK --- 
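At a glance, the new example differs from the plain PVC GEMM in only two places: the kernel type names the stream-K scheduler, and the arguments gain a scheduler field that selects the split count and decomposition mode. A condensed sketch follows, with the collective types and a SchedulerParams alias abbreviated for brevity; the full definitions appear in the listing below.

// Kernel: same collectives as pvc_gemm.cpp, plus the stream-K scheduler tag.
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    Shape<int, int, int, int>,
    CollectiveMainloop,            // defined as in the full example below
    CollectiveEpilogue,            // defined as in the full example below
    cutlass::gemm::StreamKScheduler>;

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

// Arguments: the trailing scheduler member picks the decomposition at runtime.
using SchedulerParams =
    cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams;

typename Gemm::GemmKernel::Arguments arguments{
    cutlass::gemm::GemmUniversalMode::kGemm,
    problem_size,
    {block_A.get(), stride_A, block_B.get(), stride_B},
    {{alpha, beta}, block_C.get(), stride_C, block_D.get(), stride_D},
    hw_info,
    {splits, SchedulerParams::DecompositionMode::StreamK}};   // or ::SplitK

Because stream-K needs barrier and reduction scratch, the example also sizes a workspace with Gemm::get_workspace_size(arguments) and passes the allocation to gemm_op.initialize(arguments, workspace.get()) before launching.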
examples/sycl/pvc/CMakeLists.txt | 5 + examples/sycl/pvc/pvc_gemm.cpp | 10 +- examples/sycl/pvc/pvc_gemm_streamk.cpp | 383 +++++++++++++++++++++++++ 3 files changed, 390 insertions(+), 8 deletions(-) create mode 100644 examples/sycl/pvc/pvc_gemm_streamk.cpp diff --git a/examples/sycl/pvc/CMakeLists.txt b/examples/sycl/pvc/CMakeLists.txt index 322896e20e..f9c5fca18c 100644 --- a/examples/sycl/pvc/CMakeLists.txt +++ b/examples/sycl/pvc/CMakeLists.txt @@ -41,3 +41,8 @@ cutlass_example_add_executable( pvc_collective_builder pvc_collective_builder.cpp ) + +cutlass_example_add_executable( + pvc_gemm_streamk + pvc_gemm_streamk.cpp +) diff --git a/examples/sycl/pvc/pvc_gemm.cpp b/examples/sycl/pvc/pvc_gemm.cpp index 18e2b48c10..5316e8f2cc 100644 --- a/examples/sycl/pvc/pvc_gemm.cpp +++ b/examples/sycl/pvc/pvc_gemm.cpp @@ -47,7 +47,6 @@ #include "cutlass/util/reference/device/tensor_compare.h" #include "common.h" -#include "cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp" using namespace cute; /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -220,9 +219,7 @@ struct ExampleRunner { problem_size, {block_A.get(), stride_A, block_B.get(), stride_B}, {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, - hw_info, - {16, - cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::SplitK} + hw_info }; Gemm gemm_op; @@ -247,8 +244,6 @@ struct ExampleRunner { GPU_Clock timer; timer.start(); for (int i = 0; i < options.iterations; ++i) { - if(workspace_size > 0) - Gemm::GemmKernel::initialize_workspace(arguments, workspace.get()); gemm_op.run(); } syclcompat::wait(); @@ -359,8 +354,7 @@ int main(int argc, const char** argv) using GemmKernel = cutlass::gemm::kernel::GemmUniversal< Shape, CollectiveMainloop, - CollectiveEpilogue, - cutlass::gemm::StreamKScheduler + CollectiveEpilogue >; using Gemm = cutlass::gemm::device::GemmUniversalAdapter; diff --git a/examples/sycl/pvc/pvc_gemm_streamk.cpp b/examples/sycl/pvc/pvc_gemm_streamk.cpp new file mode 100644 index 0000000000..a28c6b18d7 --- /dev/null +++ b/examples/sycl/pvc/pvc_gemm_streamk.cpp @@ -0,0 +1,383 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/collective/intel_pvc_epilogue.hpp" +#include "cutlass/epilogue/fusion/intel_pvc_callbacks.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include +#include + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "common.h" + +#include "cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp" +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + bool splitk; + + int m, n, k, l, iterations, splits; + float alpha, beta; + + Options(): + help(false), + error(false), + splitk(false), + m(5120), n(4096), k(4096), l(1), iterations(20), splits(1), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + if (cmd.check_cmd_line_flag("splitk")) { + splitk = true; + } + + cmd.get_cmd_line_argument("m", m, 5120); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("splits", splits, 16); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + cmd.get_cmd_line_argument("splits", splits, 1); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "PVC GEMM Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --splitk If specified, uses SplitK decomposition\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --splits= Sets the splitting factor for GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class Gemm +> +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + uint64_t seed = 0; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + syclcompat::wait(); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareEqual( + block_ref_D.get(), block_D.get(), block_D.size()); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, 
cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + initialize_block(block_A, seed + 2023); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2021); + } + + void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info, + {options.splits, + options.splitk ? cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::SplitK : + cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::StreamK} + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + + syclcompat::wait(); + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + Gemm::GemmKernel::initialize_workspace(arguments, workspace.get()); + gemm_op.run(); + } + syclcompat::wait(); + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + + return; + } + +}; + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + + // The code section below describes datatype for input, output matrices and computation between + // elements in input matrices. 
+ using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A + using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + using GmemTiledCopyA = XE_2D_U16x8x16x4x2_LD_N; + using GmemTiledCopyB = XE_2D_U16x16x16x2x2_V; + + // Workgroup-level tile + using TileShape = Shape<_256, _256, _32>; + + using TiledMma = TiledMMA, + Layout>, + Tile<_32,_64,_32>>; // Subgroup level-tile + + constexpr int PipelineStages = 3; + using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelPVC; + using EpilogueDispatchPolicy = cutlass::epilogue::IntelPVCEpilogue; + + using EpilogueOp = cutlass::epilogue::fusion::LinearCombination; + + using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks; + using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue< + EpilogueDispatchPolicy, + TileShape, + ElementAccumulator, + cutlass::gemm::TagToStrideC_t, + ElementOutput, + cutlass::gemm::TagToStrideC_t, + FusionCallBacks, + XE_2D_U32x8x16x1x1_LD_N, + void, void, + XE_2D_U32x8x16x1x1_ST_N, + void, void>; + +// Mainloop + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + GEMMDispatchPolicy, + TileShape, + ElementInputA, + cutlass::gemm::TagToStrideA_t, + ElementInputB, + cutlass::gemm::TagToStrideB_t, + TiledMma, + GmemTiledCopyA, void, void, cute::identity, // A + GmemTiledCopyB, void, void, cute::identity // B + >; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue, + cutlass::gemm::StreamKScheduler + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + runner.run(options, hw_info); + + return 0; +} From e8b2d2496586c29bb69bebd3dac7b1a3d2382f5f Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Fri, 6 Sep 2024 16:39:20 +0100 Subject: [PATCH 21/30] Address feedback for split barrier --- cmake/FindDPCPP.cmake | 2 +- include/cutlass/arch/barrier.h | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake index ba60ed2f8b..3574ea1604 100644 --- a/cmake/FindDPCPP.cmake +++ b/cmake/FindDPCPP.cmake @@ -58,7 +58,7 @@ if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "") endif() if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc") - list(APPEND DPCPP_FLAGS "-Xspirv-translator;--spirv-ext=+SPV_INTEL_split_barrier;-DSPV_INTEL_split_barrier;") + list(APPEND DPCPP_FLAGS "-Xspirv-translator;--spirv-ext=+SPV_INTEL_split_barrier") endif() if(UNIX) diff --git a/include/cutlass/arch/barrier.h b/include/cutlass/arch/barrier.h index 756f21971d..db2777e24a 100644 --- a/include/cutlass/arch/barrier.h +++ b/include/cutlass/arch/barrier.h @@ -38,8 +38,13 @@ #include #if defined SYCL_INTEL_TARGET -SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierWaitINTEL(int, int, int); -SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierArriveINTEL(int, int, int); +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierWaitINTEL(int execution_scope, int memory_scope, int memory_semantics); +SYCL_EXTERNAL __attribute__((convergent)) void 
__spirv_ControlBarrierArriveINTEL(int execution_scope, int memory_scope, int memory_semantics); + +#define EXECUTION_SCOPE_WORK_GROUP 2 +#define MEMORY_SCOPE_WORK_GROUP 2 +#define MEMORY_SEMANTICS_RELAXED 0 + #elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && (__CUDACC_VER_MAJOR__ >= 12) #define CUDA_BARRIER_ENABLED 1 #else @@ -156,8 +161,8 @@ class NamedBarrier { CUTLASS_DEVICE static void arrive_and_wait_internal(uint32_t num_threads, uint32_t barrier_id) { #if defined SYCL_INTEL_TARGET - __spirv_ControlBarrierArriveINTEL(2, 2, 0x0); - __spirv_ControlBarrierWaitINTEL(2, 2, 0x0); + __spirv_ControlBarrierArriveINTEL(EXECUTION_SCOPE_WORK_GROUP, MEMORY_SCOPE_WORK_GROUP, MEMORY_SEMANTICS_RELAXED); + __spirv_ControlBarrierWaitINTEL(EXECUTION_SCOPE_WORK_GROUP, MEMORY_SCOPE_WORK_GROUP, MEMORY_SEMANTICS_RELAXED); #elif defined CUDA_BARRIER_ENABLED asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads)); #elif defined(__CUDA_ARCH__) @@ -168,7 +173,7 @@ class NamedBarrier { CUTLASS_DEVICE static void arrive_internal(uint32_t num_threads, uint32_t barrier_id) { #if defined SYCL_INTEL_TARGET - __spirv_ControlBarrierArriveINTEL(2, 2, 0x0); + __spirv_ControlBarrierArriveINTEL(EXECUTION_SCOPE_WORK_GROUP, MEMORY_SCOPE_WORK_GROUP, MEMORY_SEMANTICS_RELAXED); #elif CUDA_BARRIER_ENABLED asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads)); #elif defined(__CUDA_ARCH__) From c3875e7330edae164a03bb7ac40bf1a51938e885 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 9 Sep 2024 10:04:00 +0100 Subject: [PATCH 22/30] Fix address space for atomicAdd * Instantiate new accumulator registers per iteration --- include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp | 4 +--- include/cutlass/gpu_generics.h | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp index f699a8fd6b..549350d893 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp +++ b/include/cutlass/gemm/kernel/intel_pvc_gemm_streamk.hpp @@ -259,8 +259,6 @@ class GemmUniversal< CollectiveMainloop collective_mma; CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; - Tensor accumulators = make_tensor(Shape, Int, Int>{}); - const int m_offset = sub_group_id / CollectiveMainloop::sg_per_wg_n * get<0>(subgroup_shape); const int n_offset = sub_group_id % CollectiveMainloop::sg_per_wg_n * get<1>(subgroup_shape); @@ -290,7 +288,7 @@ class GemmUniversal< auto n_max_coord = N - get<1>(subgroup_shape) * n_coord; // N - SUB_N * n_coord auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue); - clear(accumulators); + Tensor accumulators = make_tensor(Shape, Int, Int>{}); // Perform the collective scoped MMA collective_mma( diff --git a/include/cutlass/gpu_generics.h b/include/cutlass/gpu_generics.h index a406791437..44b5a92acb 100644 --- a/include/cutlass/gpu_generics.h +++ b/include/cutlass/gpu_generics.h @@ -317,7 +317,7 @@ using dim3 = syclcompat::dim3; template CUTLASS_DEVICE T atomicAdd(T *address, T val) { #if defined(__SYCL_DEVICE_ONLY__) - return syclcompat::atomic_fetch_add(address, val); + return syclcompat::atomic_fetch_add(address, val); #endif return 0; } From 7cfbf62e4f791698d3be75742dec78c721df9f44 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 9 Sep 2024 12:03:06 +0100 Subject: [PATCH 23/30] Renamed the pipeline file --- include/cutlass/gemm/kernel/gemm_universal.hpp | 2 +- ...ntel_pvc_gemm_streamk.hpp => 
intel_pvc_gemm_cooperative.hpp} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename include/cutlass/gemm/kernel/{intel_pvc_gemm_streamk.hpp => intel_pvc_gemm_cooperative.hpp} (100%) diff --git a/include/cutlass/gemm/kernel/gemm_universal.hpp b/include/cutlass/gemm/kernel/gemm_universal.hpp index 3b601005d4..0d063db130 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.hpp +++ b/include/cutlass/gemm/kernel/gemm_universal.hpp @@ -65,6 +65,6 @@ struct IsCutlass3ArrayKernel Date: Mon, 16 Sep 2024 16:19:57 +0100 Subject: [PATCH 24/30] Renamed files to xe_* * Removed l2 workspace alignment --- examples/sycl/pvc/pvc_gemm_streamk.cpp | 7 +++---- .../cutlass/gemm/kernel/gemm_universal.hpp | 2 +- .../cutlass/gemm/kernel/tile_scheduler.hpp | 4 ++-- ...ooperative.hpp => xe_gemm_cooperative.hpp} | 0 ...sistent_tile_scheduler_params_streamk.hpp} | 19 +++++-------------- ...eamk.hpp => xe_tile_scheduler_streamk.hpp} | 12 ++++++------ 6 files changed, 17 insertions(+), 27 deletions(-) rename include/cutlass/gemm/kernel/{intel_pvc_gemm_cooperative.hpp => xe_gemm_cooperative.hpp} (100%) rename include/cutlass/gemm/kernel/{intel_pvc_persistent_tile_scheduler_params_streamk.hpp => xe_persistent_tile_scheduler_params_streamk.hpp} (97%) rename include/cutlass/gemm/kernel/{intel_pvc_tile_scheduler_streamk.hpp => xe_tile_scheduler_streamk.hpp} (98%) diff --git a/examples/sycl/pvc/pvc_gemm_streamk.cpp b/examples/sycl/pvc/pvc_gemm_streamk.cpp index a28c6b18d7..2c09eb6731 100644 --- a/examples/sycl/pvc/pvc_gemm_streamk.cpp +++ b/examples/sycl/pvc/pvc_gemm_streamk.cpp @@ -47,7 +47,7 @@ #include "cutlass/util/reference/device/tensor_compare.h" #include "common.h" -#include "cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp" +#include "cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp" using namespace cute; /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -87,7 +87,6 @@ struct Options { cmd.get_cmd_line_argument("n", n, 4096); cmd.get_cmd_line_argument("k", k, 4096); cmd.get_cmd_line_argument("l", l, 1); - cmd.get_cmd_line_argument("splits", splits, 16); cmd.get_cmd_line_argument("alpha", alpha, 1.f); cmd.get_cmd_line_argument("beta", beta, 0.f); cmd.get_cmd_line_argument("iterations", iterations, 100); @@ -232,8 +231,8 @@ struct ExampleRunner { {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, hw_info, {options.splits, - options.splitk ? cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::SplitK : - cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::StreamK} + options.splitk ? 
cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::SplitK : + cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::StreamK} }; Gemm gemm_op; diff --git a/include/cutlass/gemm/kernel/gemm_universal.hpp b/include/cutlass/gemm/kernel/gemm_universal.hpp index 0d063db130..eced02115a 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.hpp +++ b/include/cutlass/gemm/kernel/gemm_universal.hpp @@ -65,6 +65,6 @@ struct IsCutlass3ArrayKernel { - using Scheduler = PersistentTileSchedulerIntelPVCStreamK; + using Scheduler = PersistentTileSchedulerXeStreamK; }; #endif diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm_cooperative.hpp b/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp similarity index 100% rename from include/cutlass/gemm/kernel/intel_pvc_gemm_cooperative.hpp rename to include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp diff --git a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp similarity index 97% rename from include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp rename to include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp index 85fcfdbefb..443e2cf100 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp +++ b/include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp @@ -48,8 +48,8 @@ namespace kernel { namespace detail { //////////////////////////////////////////////////////////////////////////////// -// Parameters for Intel PVC persistent stream-K scheduler -struct PersistentTileSchedulerIntelPVCStreamKParams { +// Parameters for Xe persistent stream-K scheduler +struct PersistentTileSchedulerXeStreamKParams { // Strategies for computing reductions between work-groups computing portions of a given output tile enum class ReductionMode { @@ -88,7 +88,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { FastDivmodU64 divmod_blk_major_{}; // Divide up the number of stream-K tiles amongst G groups of stream-K units. - // Currently defaults to 1 since we don't create groups for PVC. + // Currently defaults to 1 since we don't create groups for Xe. 
FastDivmodU64 divmod_sk_groups_{}; // Number of stream-K units in each group @@ -464,7 +464,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { static size_t get_barrier_workspace_size(uint64_t num_tiles, uint32_t barrier_bits) { size_t workspace_bits = num_tiles * static_cast(barrier_bits); - return round_up_to_l2_alignment(bits_to_bytes(workspace_bits)); + return bits_to_bytes(workspace_bits); } // Calculates the size of the workspace needed for holding partial outputs from splits @@ -473,7 +473,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { get_reduction_workspace_size(uint64_t num_tiles, GemmCoord tile_shape, uint32_t accumulator_bits, uint32_t num_accumulator_mtxs = 1) { size_t output_tile_size = tile_shape.m() * tile_shape.n(); size_t workspace_bits = accumulator_bits * output_tile_size * num_tiles * num_accumulator_mtxs; - return round_up_to_l2_alignment(bits_to_bytes(workspace_bits)); + return bits_to_bytes(workspace_bits); } static void @@ -695,15 +695,6 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { sk_units_ = 0; divmod_sk_units_per_group_ = FastDivmodU64(blocks_m * blocks_n * blocks_l); } - - private: - // Round up number of bytes to the nearest multiple of L2 cache line alignment - CUTLASS_HOST_DEVICE - static size_t - round_up_to_l2_alignment(size_t bytes) { - constexpr size_t L2CacheLineSizeBytes = 128u; - return (bytes + L2CacheLineSizeBytes - 1) / L2CacheLineSizeBytes * L2CacheLineSizeBytes; - } }; //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp similarity index 98% rename from include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp rename to include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp index 69ad69a374..1caf2cba0a 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp @@ -38,7 +38,7 @@ #include "cutlass/kernel_hardware_info.hpp" #include "cute/layout.hpp" #include "cute/tensor.hpp" -#include "cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp" +#include "cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp" namespace cutlass::gemm::kernel::detail { @@ -46,7 +46,7 @@ namespace cutlass::gemm::kernel::detail { template < class TileShape > -class PersistentTileSchedulerIntelPVCStreamK { +class PersistentTileSchedulerXeStreamK { // // Data members // @@ -59,7 +59,7 @@ class PersistentTileSchedulerIntelPVCStreamK { // Use a dummy barrier manager to simply get the type used to store the barrier using BarrierType = typename NamedBarrierManager<1>::T; - using Params = PersistentTileSchedulerIntelPVCStreamKParams; + using Params = PersistentTileSchedulerXeStreamKParams; using ReductionMode = Params::ReductionMode; using DecompositionMode = Params::DecompositionMode; @@ -180,10 +180,10 @@ class PersistentTileSchedulerIntelPVCStreamK { } CUTLASS_HOST_DEVICE - PersistentTileSchedulerIntelPVCStreamK() { }; + PersistentTileSchedulerXeStreamK() { }; CUTLASS_HOST_DEVICE - PersistentTileSchedulerIntelPVCStreamK(Params const& params_) : scheduler_params(params_) { + PersistentTileSchedulerXeStreamK(Params const& params_) : scheduler_params(params_) { current_work_linear_idx_ = uint64_t(BlockIdxX()); } @@ -324,7 +324,7 @@ template int barrier_group_thread_idx = ThreadIdxX(); // Reductions use BlockStripedReduce with a width of 
BarrierManager::ThreadCount under the hood. - // Thus, the start of the reduction space is the same across all threads in a warp group. + // Thus, the start of the reduction space is the same across all threads in a work group. int reduction_offset = (cute::size<0>(TileShape{}) * cute::size<1>(TileShape{}) * reduction_tile_idx * num_accumulator_mtxs) + reduction_peer_offset; From e2a0d9b3524c220009980c1a81515cc217e1ea27 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 18 Sep 2024 14:58:12 +0100 Subject: [PATCH 25/30] Update the example to reduce caching effects --- examples/sycl/pvc/common.h | 4 ++ examples/sycl/pvc/pvc_gemm_streamk.cpp | 67 ++++++++++++++++++-------- 2 files changed, 52 insertions(+), 19 deletions(-) diff --git a/examples/sycl/pvc/common.h b/examples/sycl/pvc/common.h index cd11b1c7c9..915728da0d 100644 --- a/examples/sycl/pvc/common.h +++ b/examples/sycl/pvc/common.h @@ -58,3 +58,7 @@ bool initialize_block( block.get(), block.size(), seed, scope_max, scope_min, 0); return true; } + +size_t get_llc_size() { + return syclcompat::get_default_queue().get_device().get_info(); +} diff --git a/examples/sycl/pvc/pvc_gemm_streamk.cpp b/examples/sycl/pvc/pvc_gemm_streamk.cpp index 2c09eb6731..ed98dc86d0 100644 --- a/examples/sycl/pvc/pvc_gemm_streamk.cpp +++ b/examples/sycl/pvc/pvc_gemm_streamk.cpp @@ -52,6 +52,8 @@ using namespace cute; /////////////////////////////////////////////////////////////////////////////////////////////////// +#define CUTLASS_SYCL_PROFILING_ENABLED + // Command line options parsing struct Options { @@ -142,6 +144,8 @@ struct ExampleRunner { using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + int32_t count; + // // Data members // @@ -153,9 +157,9 @@ struct ExampleRunner { StrideD stride_D; uint64_t seed = 0; - cutlass::DeviceAllocation block_A; - cutlass::DeviceAllocation block_B; - cutlass::DeviceAllocation block_C; + std::vector> block_A; + std::vector> block_B; + std::vector> block_C; cutlass::DeviceAllocation block_D; cutlass::DeviceAllocation block_ref_D; @@ -166,9 +170,9 @@ struct ExampleRunner { bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { auto [M, N, K, L] = problem_size; - cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); - cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); - cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_A(block_A[0].get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B[0].get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C[0].get(), LayoutC::packed({M, N})); cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); cutlass::reference::device::GemmComplex( @@ -208,15 +212,27 @@ struct ExampleRunner { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - block_A.reset(M * K * L); - block_B.reset(K * N * L); - block_C.reset(M * N * L); + std::size_t mem_occupied_ABC = (M * K * L * sizeof(ElementA)) + (K * N * L * sizeof(ElementB)) + + (M * N * L * sizeof(ElementC)); + count = std::ceil(static_cast(get_llc_size()) / static_cast(mem_occupied_ABC)); + + for(int i = 0; i < count; i++) { + block_A.emplace_back(); + block_B.emplace_back(); + block_C.emplace_back(); + } + + for (int i = 0; i < count; i++) { + block_A[i].reset(M * K * L); + block_B[i].reset(K * N * L); + block_C[i].reset(M * N * L); + initialize_block(block_A[i], seed + 
i); + initialize_block(block_B[i], seed + i); + initialize_block(block_C[i], seed + i); + } + block_D.reset(M * N * L); block_ref_D.reset(M * N * L); - - initialize_block(block_A, seed + 2023); - initialize_block(block_B, seed + 2022); - initialize_block(block_C, seed + 2021); } void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { @@ -227,8 +243,8 @@ struct ExampleRunner { typename Gemm::GemmKernel::Arguments arguments{ cutlass::gemm::GemmUniversalMode::kGemm, problem_size, - {block_A.get(), stride_A, block_B.get(), stride_B}, - {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + {block_A[0].get(), stride_A, block_B[0].get(), stride_B}, + {{options.alpha, options.beta}, block_C[0].get(), stride_C, block_D.get(), stride_D}, hw_info, {options.splits, options.splitk ? cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::SplitK : @@ -255,14 +271,27 @@ struct ExampleRunner { if (passed && options.iterations > 0) { GPU_Clock timer; - timer.start(); + float elapsed_time_seconds = 0.f; for (int i = 0; i < options.iterations; ++i) { - Gemm::GemmKernel::initialize_workspace(arguments, workspace.get()); + int32_t idx = std::max(int(0), (i % count) - 1); + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A[idx].get(), stride_A, block_B[idx].get(), stride_B}, + {{options.alpha, options.beta}, block_C[idx].get(), stride_C, block_D.get(), stride_D}, + hw_info, + {options.splits, + options.splitk ? cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::SplitK : + cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::StreamK} + }; + gemm_op.initialize(arguments, workspace.get()); + timer.start(); gemm_op.run(); + syclcompat::wait(); + elapsed_time_seconds += timer.seconds(); } - syclcompat::wait(); - float cute_time = timer.seconds() / options.iterations; + float cute_time = elapsed_time_seconds / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); From 6d1900082ef2632649528633e5adaabd0fa65920 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Fri, 20 Sep 2024 12:51:47 +0100 Subject: [PATCH 26/30] Refactor pipeline code --- .../gemm/kernel/xe_gemm_cooperative.hpp | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp b/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp index 549350d893..cd299ce6de 100644 --- a/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp +++ b/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp @@ -247,18 +247,6 @@ class GemmUniversal< constexpr auto workgroup_shape = WorkgroupTileShape{}; // (BLK_M,BLK_N,BLK_K) constexpr auto subgroup_shape = SubgroupTileShape{}; // (SUB_M,SUB_N,SUB_K) - constexpr int version = - is_same_v - ? 
1 - : 2; - - auto k_residue = K - get<2>(subgroup_shape) * (K / get<2>(subgroup_shape)); // K - SUB_K * k_coord_max - - TiledMma tiled_mma; - CollectiveMainloop collective_mma; - CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; - const int m_offset = sub_group_id / CollectiveMainloop::sg_per_wg_n * get<0>(subgroup_shape); const int n_offset = sub_group_id % CollectiveMainloop::sg_per_wg_n * get<1>(subgroup_shape); @@ -268,8 +256,8 @@ class GemmUniversal< const int l_coord = work_tile_info.L_idx; // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work. - auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, workgroup_shape); - auto work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info); + const int work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, workgroup_shape); + const int work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info); auto k_tile_iter = cute::make_coord_iterator(idx2crd(work_k_tile_start, make_shape(K)), make_shape(K)); const auto tile_coord = make_coord(m_coord, n_coord, _, l_coord); @@ -278,11 +266,19 @@ class GemmUniversal< make_shape(_1{}, K, L), make_stride(Int{} * get<0>(MmaAtomShape()),_1{})); + constexpr int version = + is_same_v + ? 1 + : 2; + Tensor tBi = params.mainloop.gmem_tiled_copy_b.get_pvc_tensor( make_coord(n_coord, 0, 0), make_shape(Int{}, K, L), make_stride(Int(MmaAtomShape())>{}, _1{})); + auto k_residue = K - get<2>(subgroup_shape) * (K / get<2>(subgroup_shape)); // K - SUB_K * k_coord_max + // Compute tile residues for predication auto m_max_coord = M - get<0>(subgroup_shape) * m_coord; // M - SUB_M * m_coord auto n_max_coord = N - get<1>(subgroup_shape) * n_coord; // N - SUB_N * n_coord @@ -290,6 +286,8 @@ class GemmUniversal< Tensor accumulators = make_tensor(Shape, Int, Int>{}); + CollectiveMainloop collective_mma; + // Perform the collective scoped MMA collective_mma( accumulators, @@ -308,6 +306,9 @@ class GemmUniversal< params.scheduler, work_tile_info, accumulators); if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) { + CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue}; + TiledMma tiled_mma; + epilogue( problem_shape_MNKL, subgroup_shape, From c06a28e52eaa04ed04eb113c0fe2439f9fad5c9a Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Thu, 26 Sep 2024 16:05:37 +0100 Subject: [PATCH 27/30] Add the option to invoke data parallel decomposition --- examples/sycl/pvc/pvc_gemm_streamk.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/examples/sycl/pvc/pvc_gemm_streamk.cpp b/examples/sycl/pvc/pvc_gemm_streamk.cpp index ed98dc86d0..94a15e7a71 100644 --- a/examples/sycl/pvc/pvc_gemm_streamk.cpp +++ b/examples/sycl/pvc/pvc_gemm_streamk.cpp @@ -60,6 +60,7 @@ struct Options { bool help; bool error; bool splitk; + bool dp; int m, n, k, l, iterations, splits; float alpha, beta; @@ -68,6 +69,7 @@ struct Options { help(false), error(false), splitk(false), + dp(false), m(5120), n(4096), k(4096), l(1), iterations(20), splits(1), alpha(1.f), beta(0.f) { } @@ -85,6 +87,10 @@ struct Options { splitk = true; } + if (cmd.check_cmd_line_flag("dp")) { + dp = true; + } + cmd.get_cmd_line_argument("m", m, 5120); cmd.get_cmd_line_argument("n", n, 4096); cmd.get_cmd_line_argument("k", k, 4096); @@ -101,7 +107,8 @@ struct Options { out << "PVC GEMM Example\n\n" << "Options:\n\n" << " --help If specified, displays 
this usage statement\n\n" - << " --splitk If specified, uses SplitK decomposition\n" + << " --dp If specified, uses Data Parallel decomposition\n" + << " --splitk If specified, uses SplitK decomposition\n" << " --m= Sets the M extent of the GEMM\n" << " --n= Sets the N extent of the GEMM\n" << " --k= Sets the K extent of the GEMM\n" @@ -247,6 +254,7 @@ struct ExampleRunner { {{options.alpha, options.beta}, block_C[0].get(), stride_C, block_D.get(), stride_D}, hw_info, {options.splits, + options.dp ? cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::DataParallel : options.splitk ? cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::SplitK : cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::StreamK} }; @@ -281,6 +289,7 @@ struct ExampleRunner { {{options.alpha, options.beta}, block_C[idx].get(), stride_C, block_D.get(), stride_D}, hw_info, {options.splits, + options.dp ? cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::DataParallel : options.splitk ? cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::SplitK : cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::StreamK} }; From 2997eca414910c1d003a9c3e9a4093e2f7410930 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 23 Oct 2024 19:25:13 +0100 Subject: [PATCH 28/30] Fixing bugs post merge --- examples/sycl/pvc/pvc_gemm_streamk.cpp | 18 ++--- include/cutlass/gemm/collective/xe_mma.hpp | 66 ++++++++++++------- include/cutlass/gemm/kernel/xe_gemm.hpp | 2 + .../gemm/kernel/xe_gemm_cooperative.hpp | 46 +++++-------- 4 files changed, 71 insertions(+), 61 deletions(-) diff --git a/examples/sycl/pvc/pvc_gemm_streamk.cpp b/examples/sycl/pvc/pvc_gemm_streamk.cpp index 94a15e7a71..6f142f98ff 100644 --- a/examples/sycl/pvc/pvc_gemm_streamk.cpp +++ b/examples/sycl/pvc/pvc_gemm_streamk.cpp @@ -30,8 +30,8 @@ **************************************************************************************************/ #include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "cutlass/epilogue/collective/intel_pvc_epilogue.hpp" -#include "cutlass/epilogue/fusion/intel_pvc_callbacks.hpp" +#include "cutlass/epilogue/collective/xe_epilogue.hpp" +#include "cutlass/epilogue/fusion/xe_callbacks.hpp" #include "cutlass/gemm/device/gemm_universal.h" #include "cutlass/gemm/device/gemm_universal_adapter.h" #include "cutlass/gemm/collective/collective_mma.hpp" @@ -358,15 +358,15 @@ int main(int argc, const char** argv) using LayoutC = cutlass::layout::RowMajor; using LayoutD = cutlass::layout::RowMajor; - using GmemTiledCopyA = XE_2D_U16x8x16x4x2_LD_N; - using GmemTiledCopyB = XE_2D_U16x16x16x2x2_V; + using GmemTiledCopyA = XE_2D_U16x8x16_LD_N; + using GmemTiledCopyB = XE_2D_U16x16x16_LD_V; // Workgroup-level tile - using TileShape = Shape<_256, _256, _32>; + using TileShape = Shape<_256, _128, _16>; using TiledMma = TiledMMA, - Layout>, - Tile<_32,_64,_32>>; // Subgroup level-tile + Layout>, + Tile<_64,_32,_16>>; // Subgroup level-tile constexpr int PipelineStages = 3; using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelPVC; @@ -385,9 +385,9 @@ int main(int argc, const char** argv) ElementOutput, cutlass::gemm::TagToStrideC_t, FusionCallBacks, - XE_2D_U32x8x16x1x1_LD_N, + XE_2D_U32x8x16_LD_N, void, void, - XE_2D_U32x8x16x1x1_ST_N, + XE_2D_U32x8x16_ST_N, void, void>; // Mainloop diff --git 
a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp index 61fc7867fc..1df7d6e3a0 100644 --- a/include/cutlass/gemm/collective/xe_mma.hpp +++ b/include/cutlass/gemm/collective/xe_mma.hpp @@ -176,7 +176,8 @@ struct CollectiveMma< class TensorB, class FrgTensorC, class KTileIterator, - class ResidueMNK + class ResidueMNK, + class BlkCoord > CUTLASS_DEVICE void operator() ( @@ -186,6 +187,8 @@ struct CollectiveMma< FrgTensorC const &src_accum, KTileIterator k_tile_iter, int k_tile_count, ResidueMNK residue_mnk, + BlkCoord const &blk_coord, + int const &K, int thread_idx, char *smem_buf, Params const& mainloop) @@ -230,38 +233,53 @@ struct CollectiveMma< // // Mainloop // - const int m_coord = BlockIdxY() * BLK_M + (get_sub_group_id() / ATOM_N) * SG_M; - const int n_coord = BlockIdxX() * BLK_N + (get_sub_group_id() % ATOM_N) * SG_N; - const int l_coord = BlockIdxZ(); + + auto [m_idx, n_idx, k_idx, l_idx] = blk_coord; + const int m_coord = m_idx * BLK_M + (get_sub_group_id() / ATOM_N) * SG_M; + const int n_coord = n_idx * BLK_N + (get_sub_group_id() % ATOM_N) * SG_N; + const int l_coord = l_idx; + Tensor iter_a = mainloop.gmem_tiled_copy_a.get_pvc_tensor( - make_coord(m_coord, 0, l_coord), make_shape(_, size<1>(tCrA_copy_view.shape()), size<2>(tCrA_copy_view.shape()), k_tile_count), - append<3>(typename XE_Copy_A::Shape_MN{}, BLK_K), seq<0,1,1>{}); + make_coord(m_coord, 0, l_coord), + make_shape(_, size<1>(tCrA_copy_view.shape()), + size<2>(tCrA_copy_view.shape()), k_tile_count), + append<3>(typename XE_Copy_A::Shape_MN{}, BLK_K), seq<0,1,1>{}); + Tensor iter_b = mainloop.gmem_tiled_copy_b.get_pvc_tensor( - make_coord(0, n_coord, l_coord), make_shape(_, size<2>(tCrB_copy_view.shape()), size<1>(tCrB_copy_view.shape()), k_tile_count), - append<3>(typename XE_Copy_B::Shape_MN{}, BLK_K), seq<0,1,0>{}); -#pragma unroll - for (int i = 0; i < DispatchPolicy::Stages; i++) { + make_coord(0, n_coord, l_coord), + make_shape(_, size<2>(tCrB_copy_view.shape()), + size<1>(tCrB_copy_view.shape()), k_tile_count), + append<3>(typename XE_Copy_B::Shape_MN{}, BLK_K), seq<0,1,0>{}); + + const int k_start_idx = crd2idx((*k_tile_iter), make_shape(K)); + int prefetch_k = k_start_idx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < DispatchPolicy::Stages; i++, prefetch_k++) { if constexpr(cute::detail::has_prefetch) { - prefetch(mainloop.gmem_tiled_copy_a, iter_a(_,_,_,i)); + prefetch(mainloop.gmem_tiled_copy_a, iter_a(_,_,_,prefetch_k)); } if constexpr(cute::detail::has_prefetch) { - prefetch(mainloop.gmem_tiled_copy_b, iter_b(_,_,_,i)); + prefetch(mainloop.gmem_tiled_copy_b, iter_b(_,_,_,prefetch_k)); } } -#pragma unroll - for (int k_tile = 0; k_tile < k_tile_count; ++k_tile) { + + CUTLASS_PRAGMA_UNROLL + for (int k_tile = 0, k = k_start_idx; k_tile < k_tile_count; ++k_tile, ++k, ++prefetch_k) { // Copy gmem to rmem for the first k_tile - copy(mainloop.gmem_tiled_copy_a, iter_a(_,_,_,k_tile), tCrA_copy_view); - copy(mainloop.gmem_tiled_copy_b, iter_b(_,_,_,k_tile), tCrB_copy_view); - if(k_tile + DispatchPolicy::Stages < k_tile_count) { - if constexpr(cute::detail::has_prefetch) { - prefetch(mainloop.gmem_tiled_copy_a, iter_a(_,_,_,k_tile + DispatchPolicy::Stages)); - } - if constexpr(cute::detail::has_prefetch) { - prefetch(mainloop.gmem_tiled_copy_b, iter_b(_,_,_,k_tile + DispatchPolicy::Stages)); + copy(mainloop.gmem_tiled_copy_a, iter_a(_,_,_,k), tCrA_copy_view); + copy(mainloop.gmem_tiled_copy_b, iter_b(_,_,_,k), tCrB_copy_view); + + if(prefetch_k < k_tile_count) { + if 
constexpr(cute::detail::has_prefetch) { + prefetch(mainloop.gmem_tiled_copy_a, iter_a(_,_,_,prefetch_k)); + } + if constexpr(cute::detail::has_prefetch) { + prefetch(mainloop.gmem_tiled_copy_b, iter_b(_,_,_,prefetch_k)); + } } - } - cute::gemm(tiled_mma, accum, tCrA, tCrB, src_accum); + + cute::gemm(tiled_mma, accum, tCrA, tCrB, src_accum); } } }; diff --git a/include/cutlass/gemm/kernel/xe_gemm.hpp b/include/cutlass/gemm/kernel/xe_gemm.hpp index 54aad46709..99f98d807c 100644 --- a/include/cutlass/gemm/kernel/xe_gemm.hpp +++ b/include/cutlass/gemm/kernel/xe_gemm.hpp @@ -248,6 +248,8 @@ class GemmUniversal< accumulators, k_tile_iter, k_tile_count, residue_mnk, + blk_coord_mnkl, + K, thread_idx, smem_buf, params.mainloop diff --git a/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp b/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp index cd299ce6de..574992c5be 100644 --- a/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp +++ b/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp @@ -243,59 +243,50 @@ class GemmUniversal< auto work_tile_info = scheduler.initial_work_tile_info(); int thread_idx = int(ThreadIdxX()); - int sub_group_id = thread_idx / SubgroupSize; constexpr auto workgroup_shape = WorkgroupTileShape{}; // (BLK_M,BLK_N,BLK_K) constexpr auto subgroup_shape = SubgroupTileShape{}; // (SUB_M,SUB_N,SUB_K) - const int m_offset = sub_group_id / CollectiveMainloop::sg_per_wg_n * get<0>(subgroup_shape); - const int n_offset = sub_group_id % CollectiveMainloop::sg_per_wg_n * get<1>(subgroup_shape); - while (work_tile_info.is_valid()) { - const int m_coord = work_tile_info.M_idx * get<0>(workgroup_shape) + m_offset; - const int n_coord = work_tile_info.N_idx * get<1>(workgroup_shape) + n_offset; + const int m_coord = work_tile_info.M_idx; + const int n_coord = work_tile_info.N_idx; const int l_coord = work_tile_info.L_idx; + const auto tile_coord = make_coord(m_coord, n_coord, _, l_coord); + + Tensor mA_mkl = make_tensor(make_gmem_ptr(static_cast(nullptr)), make_shape(M,K,L), StrideA{}); //(m,k,l) + Tensor mB_nkl = make_tensor(make_gmem_ptr(static_cast(nullptr)), make_shape(N,K,L), StrideB{}); //(n,k,l) + Tensor mA_mk = mA_mkl(_,_,l_coord); // (m,k) + Tensor mB_nk = mB_nkl(_,_,l_coord); // (n,k) + + auto gA = local_tile(mA_mk, workgroup_shape, take<0, 3>(tile_coord), Step<_1, X, _1>{}); + auto gB = local_tile(mB_nk, workgroup_shape, take<0, 3>(tile_coord), Step< X, _1, _1>{}); // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work. const int work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, workgroup_shape); const int work_k_tile_start = TileScheduler::get_work_k_tile_start(work_tile_info); auto k_tile_iter = cute::make_coord_iterator(idx2crd(work_k_tile_start, make_shape(K)), make_shape(K)); - const auto tile_coord = make_coord(m_coord, n_coord, _, l_coord); - - Tensor tAi = params.mainloop.gmem_tiled_copy_a.get_pvc_tensor( - make_coord(m_coord, 0, 0), - make_shape(_1{}, K, L), - make_stride(Int{} * get<0>(MmaAtomShape()),_1{})); - - constexpr int version = - is_same_v - ? 
1
-        : 2;
-
-    Tensor tBi = params.mainloop.gmem_tiled_copy_b.get_pvc_tensor(
-      make_coord(n_coord, 0, 0),
-      make_shape(Int{}, K, L),
-      make_stride(Int(MmaAtomShape())>{}, _1{}));
-    auto k_residue = K - get<2>(subgroup_shape) * (K / get<2>(subgroup_shape)); // K - SUB_K * k_coord_max
+    auto k_residue = K - get<2>(subgroup_shape) * (K / get<2>(subgroup_shape)); // K - SUB_K * k_coord_max
 
     // Compute tile residues for predication
     auto m_max_coord = M - get<0>(subgroup_shape) * m_coord;   // M - SUB_M * m_coord
     auto n_max_coord = N - get<1>(subgroup_shape) * n_coord;   // N - SUB_N * n_coord
     auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue);
 
-    Tensor accumulators = make_tensor(Shape, Int, Int>{});
+    TiledMma tiled_mma;
+    Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(workgroup_shape));
 
     CollectiveMainloop collective_mma;
 
     // Perform the collective scoped MMA
     collective_mma(
       accumulators,
-      tAi(_,_,_,l_coord),
-      tBi(_,_,_,l_coord),
+      gA,
+      gB,
       accumulators,
       k_tile_iter, work_k_tile_count,
       residue_mnk,
+      tile_coord,
+      K,
       thread_idx,
       smem_buf,
       params.mainloop
@@ -307,7 +298,6 @@ class GemmUniversal<
     if (TileScheduler::compute_epilogue(work_tile_info, params.scheduler)) {
       CollectiveEpilogue epilogue{params.epilogue, shared_storage.epilogue};
 
-      TiledMma tiled_mma;
       epilogue(
         problem_shape_MNKL,

From 8e20733807b11784418b5c9322aa00cc3f0a181c Mon Sep 17 00:00:00 2001
From: Muhammad Tanvir
Date: Thu, 24 Oct 2024 06:53:17 +0100
Subject: [PATCH 29/30] Address PR feedback

---
 include/cutlass/arch/barrier.h                        |  8 ++++----
 .../xe_persistent_tile_scheduler_params_streamk.hpp   | 10 ++++------
 .../cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp |  5 ++---
 include/cutlass/workspace.h                           |  2 +-
 4 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/include/cutlass/arch/barrier.h b/include/cutlass/arch/barrier.h
index db2777e24a..0e1f344f27 100644
--- a/include/cutlass/arch/barrier.h
+++ b/include/cutlass/arch/barrier.h
@@ -37,7 +37,7 @@
 #include
 #include
 
-#if defined SYCL_INTEL_TARGET
+#if defined(SYCL_INTEL_TARGET)
 SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierWaitINTEL(int execution_scope, int memory_scope, int memory_semantics);
 SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierArriveINTEL(int execution_scope, int memory_scope, int memory_semantics);
 
@@ -160,10 +160,10 @@ class NamedBarrier {
 private:
   CUTLASS_DEVICE
   static void arrive_and_wait_internal(uint32_t num_threads, uint32_t barrier_id) {
-#if defined SYCL_INTEL_TARGET
+#if defined(SYCL_INTEL_TARGET)
     __spirv_ControlBarrierArriveINTEL(EXECUTION_SCOPE_WORK_GROUP, MEMORY_SCOPE_WORK_GROUP, MEMORY_SEMANTICS_RELAXED);
     __spirv_ControlBarrierWaitINTEL(EXECUTION_SCOPE_WORK_GROUP, MEMORY_SCOPE_WORK_GROUP, MEMORY_SEMANTICS_RELAXED);
-#elif defined CUDA_BARRIER_ENABLED
+#elif CUDA_BARRIER_ENABLED
     asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads));
 #elif defined(__CUDA_ARCH__)
     asm volatile ("brkpt;\n" ::);
@@ -172,7 +172,7 @@ class NamedBarrier {
 
   CUTLASS_DEVICE
   static void arrive_internal(uint32_t num_threads, uint32_t barrier_id) {
-#if defined SYCL_INTEL_TARGET
+#if defined(SYCL_INTEL_TARGET)
     __spirv_ControlBarrierArriveINTEL(EXECUTION_SCOPE_WORK_GROUP, MEMORY_SCOPE_WORK_GROUP, MEMORY_SEMANTICS_RELAXED);
 #elif CUDA_BARRIER_ENABLED
     asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads));
diff --git a/include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp
index 443e2cf100..b98613b545 100644
--- a/include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp
+++ b/include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp
@@ -244,8 +244,6 @@ struct PersistentTileSchedulerXeStreamKParams {
 
     // Calculate the number of work units covering the data-parallel and stream-K tiles.
     // A "work unit" is a single index in the linearized ID space used by the scheduler.
-    // We distinguish it from a "block," which is typically tied to a hardware unit
-    // (e.g., the callers into this scheduler will be persistent thread blocks).
     // A work unit can encompass multiple output tiles worth of work (as will be the
     // case for stream-K blocks).
     // Since splitting is not required for data-parallel tiles, only one data-parallel unit
@@ -438,12 +436,12 @@ struct PersistentTileSchedulerXeStreamKParams {
   static uint64_t
   get_num_sk_units(uint64_t wgs_per_sk_wave, uint32_t sk_tiles, uint32_t k_tiles_per_output_tile) {
     // If there are stream-K tiles to compute and a sufficiently large number of k iterations
-    // across them, they will be covered by a single wave of persistent threadblocks. Thus, there
-    // will be as many work units as there are threadblocks in a single wave.
+    // across them, they will be covered by a single wave of persistent work_groups. Thus, there
+    // will be as many work units as there are work_groups in a single wave.
     //
     // When the total k iterations across stream-K tiles is too small to justify distributing
-    // across an entire wave of blocks, we instead distribute the iterations over a smaller
-    // set of blocks.
+    // across an entire wave of work_groups, we instead distribute the iterations over a smaller
+    // set of work_groups.
 
     // Calculate the number of stream-K units that would be needed if each stream-K unit
     // computed the minimum allowable k iterations.
diff --git a/include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp
index 1caf2cba0a..4b405b39cc 100644
--- a/include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp
+++ b/include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp
@@ -245,8 +245,7 @@ class PersistentTileSchedulerXeStreamK {
     current_work_linear_idx_ += uint64_t(GridDimX()) * uint64_t(GridDimY()) * uint64_t(GridDimZ()) * uint64_t(advance_count);
   }
 
-  // Given the inputs, computes the total number of output blocks this problem will compute over
-  // Note that this is only the logical size of our grid, not the physical grid we will actually launch.
+  // Given the inputs, computes the total number of output work-groups this problem will compute over.
   template
   CUTLASS_HOST_DEVICE static
   dim3
@@ -254,7 +253,7 @@
     return Params::get_tiled_wg_shape_mnl(to_gemm_coord(problem_shape_mnkl), to_gemm_coord(cta_shape));
   }
 
-  // Given the cluster shape, computes the physical grid we should launch.
+  // Computes the physical grid we should launch.
   template
   CUTLASS_HOST_DEVICE static
   dim3
diff --git a/include/cutlass/workspace.h b/include/cutlass/workspace.h
index bb74826805..79f3aa3d0c 100644
--- a/include/cutlass/workspace.h
+++ b/include/cutlass/workspace.h
@@ -61,7 +61,7 @@ zero_workspace(void* workspace, size_t workspace_size, cudaStream_t stream = nul
 
   CUTLASS_TRACE_HOST("  clearing workspace");
 
-#if defined CUTLASS_ENABLE_SYCL
+#if defined (CUTLASS_ENABLE_SYCL)
   syclcompat::memset_async(workspace, 0, workspace_size);
 #elif defined(CUTLASS_ENABLE_CUDA_HOST_ADAPTER) && CUTLASS_ENABLE_CUDA_HOST_ADAPTER
   //

From af74f7a755c573f9aa2ff1f9ed33c0b0d696d419 Mon Sep 17 00:00:00 2001
From: Muhammad Tanvir
Date: Thu, 7 Nov 2024 17:02:12 +0000
Subject: [PATCH 30/30] Fix tile size

---
 examples/sycl/pvc/pvc_gemm_streamk.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/sycl/pvc/pvc_gemm_streamk.cpp b/examples/sycl/pvc/pvc_gemm_streamk.cpp
index 6f142f98ff..3546ab62e3 100644
--- a/examples/sycl/pvc/pvc_gemm_streamk.cpp
+++ b/examples/sycl/pvc/pvc_gemm_streamk.cpp
@@ -358,15 +358,15 @@ int main(int argc, const char** argv)
   using LayoutC = cutlass::layout::RowMajor;
   using LayoutD = cutlass::layout::RowMajor;
 
-  using GmemTiledCopyA = XE_2D_U16x8x16_LD_N;
-  using GmemTiledCopyB = XE_2D_U16x16x16_LD_V;
+  using GmemTiledCopyA = XE_2D_U16x32x32_LD_N;
+  using GmemTiledCopyB = XE_2D_U16x32x32_LD_V;
 
   // Workgroup-level tile
-  using TileShape = Shape<_256, _128, _16>;
+  using TileShape = Shape<_256, _256, _32>;
 
   using TiledMma = TiledMMA,
-          Layout>,
-          Tile<_64,_32,_16>>;  // Subgroup level-tile
+          Layout>,
+          Tile<_64,_64,_32>>;  // Subgroup level-tile
 
   constexpr int PipelineStages = 3;
   using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelPVC;
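The comment block updated in get_num_sk_units above describes how the scheduler sizes the set of stream-K work_groups: a full persistent wave when the leftover stream-K tiles carry enough K iterations, and a smaller set when they do not. The standalone sketch below illustrates only that sizing rule; the function name, the min_iters_per_sk_unit constant, and the example numbers are assumptions for illustration and are not taken from the CUTLASS sources.

// sk_units_sketch.cpp -- illustrative sketch only; not part of CUTLASS.
#include <algorithm>
#include <cstdint>
#include <iostream>

// Assumed minimum number of K iterations worth assigning to one stream-K unit.
constexpr uint64_t min_iters_per_sk_unit = 8;

// Pick how many stream-K units (persistent work_groups) should share the
// leftover stream-K tiles: at most one unit per work_group in a single wave,
// and fewer when the total K work is too small to keep a full wave busy.
uint64_t choose_num_sk_units(uint64_t wgs_per_sk_wave,
                             uint64_t sk_tiles,
                             uint64_t k_tiles_per_output_tile) {
  // Total K iterations contributed by all stream-K tiles.
  uint64_t total_k_iters = sk_tiles * k_tiles_per_output_tile;
  // Cap the unit count so each unit gets at least min_iters_per_sk_unit iterations.
  uint64_t max_units_by_work =
      std::max<uint64_t>(total_k_iters / min_iters_per_sk_unit, 1);
  return std::min(wgs_per_sk_wave, max_units_by_work);
}

int main() {
  // Plenty of K work: every work_group in a 64-wide wave gets a unit (prints 64).
  std::cout << choose_num_sk_units(64, 8, 64) << "\n";
  // Very little K work: the iterations are shared by only 2 units (prints 2).
  std::cout << choose_num_sk_units(64, 1, 16) << "\n";
  return 0;
}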