From 13d8b6777dc14fa6dbd48fdf95ac852b97ac570c Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 25 Mar 2024 12:57:56 +0530 Subject: [PATCH 01/21] block store --- .../dpct/dpl_extras/dpcpp_extensions.h | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 5ea603ad53b1..8548f37dab7a 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -566,6 +566,97 @@ class radix_sort { uint8_t *_local_memory; }; +/// Store blocked/warped or striped work items into linear segment of items. +/// Helper for Block Store +enum store_algorithm { + + BLOCK_STORE_DIRECT, + BLOCK_STORE_STRIPED, + // To-do: BLOCK_STORE_WARP_TRANSPOSE + // To-do: BLOCK_STORE_VECTORIZE + +}; + +/// Stores a blocked arrangement of work items linear segment of items. +template +__dpct_inline__ void store_blocked(const Item &item, OutputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + // This implementation does not take in account range storage across + // workgroup items To-do: Decide whether range storage is required for group + // storage + size_t linear_tid = item.get_local_linear_id(); + OutputIteratorT workitem_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); +#pragma unroll + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + workitem_itr[idx] = items[idx]; + } +} + +/// Stores a striped arrangement of work items linear segment of items. +template +__dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + // This implementation does not take in account range storage across + // workgroup items To-do: Decide whether range storage is required for group + // storage + size_t linear_tid = item.get_local_linear_id(); + OutputIteratorT workitem_itr = block_itr + linear_tid; + size_t GROUP_WORK_ITEMS = item.get_global_range(); +#pragma unroll + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + workitem_itr[(idx * GROUP_WORK_ITEMS)] = items[idx]; + } +} + +/// Stores a warp-striped arrangement of work items linear segment of items. +// Created as free function until exchange mechanism is +// implemented. +// To-do: inline this function with BLOCK_STORE_WARP_TRANSPOSE mechanism +template +__dpct_inline__ void +store_subgroup_striped(const Item &item, OutputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + // This implementation does not take in account range loading across + // workgroup items To-do: Decide whether range loading is required for group + // loading + // This implementation uses unintialized memory for loading linear segments + // into warp striped arrangement. + uint32_t subgroup_offset = item.get_sub_group().get_local_linear_id(); + uint32_t subgroup_size = item.get_sub_group().get_local_linear_range(); + uint32_t subgroup_idx = item.get_sub_group().get_group_linear_id(); + uint32_t initial_offset = + (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset; + OutputIteratorT workitem_itr = block_itr + initial_offset; +#pragma unroll + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + workitem_itr[(idx * subgroup_size)] = items[idx]; + } +} + +template +class workgroup_store { + static size_t get_local_memory_size(size_t group_threads) { + return (group_threads * ITEMS_PER_WORK_ITEM) * sizeof(T); + } + + __dpct_inline__ void store(const Item &item, OutputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + if constexpr (ALGORITHM == BLOCK_STORE_DIRECT) { + store_blocked(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + } else if constexpr (ALGORITHM == BLOCK_STORE_STRIPED) { + store_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + } + } +}; + /// Perform a reduction of the data elements assigned to all threads in the /// group. /// From 7517519a4c0881a26bfaa8a597841371b6ca2b0c Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 25 Mar 2024 14:54:12 +0530 Subject: [PATCH 02/21] fix bug --- .../runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 8548f37dab7a..fa68a6e35742 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -587,7 +587,7 @@ __dpct_inline__ void store_blocked(const Item &item, OutputIteratorT block_itr, // workgroup items To-do: Decide whether range storage is required for group // storage size_t linear_tid = item.get_local_linear_id(); - OutputIteratorT workitem_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + OutputIteratorT workitem_itr = block_itr + (linear_tid * ITEMS_PER_WORK_ITEM); #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { workitem_itr[idx] = items[idx]; From 6b7fd09dce9e2c1c9f68367a6eca85d3b641a1fa Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Fri, 10 May 2024 12:22:15 +0530 Subject: [PATCH 03/21] update code --- .../include/dpct/dpl_extras/dpcpp_extensions.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index fa68a6e35742..55c73c87937f 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -605,7 +605,7 @@ __dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr, // storage size_t linear_tid = item.get_local_linear_id(); OutputIteratorT workitem_itr = block_itr + linear_tid; - size_t GROUP_WORK_ITEMS = item.get_global_range(); + size_t GROUP_WORK_ITEMS = item.get_global_range().size(); #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { workitem_itr[(idx * GROUP_WORK_ITEMS)] = items[idx]; @@ -639,6 +639,13 @@ store_subgroup_striped(const Item &item, OutputIteratorT block_itr, } } +// template parameters : +// ITEMS_PER_WORK_ITEM: size_t variable controlling the number of items per +// thread/work_item +// ALGORITHM: store_algorithm variable controlling the type of store operation. +// InputT: type for input sequence. +// OutputIteratorT: output iterator type +// Item : typename parameter resembling sycl::nd_item<3> . template class workgroup_store { @@ -650,9 +657,9 @@ class workgroup_store { InputT (&items)[ITEMS_PER_WORK_ITEM]) { if constexpr (ALGORITHM == BLOCK_STORE_DIRECT) { - store_blocked(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + store_blocked(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } else if constexpr (ALGORITHM == BLOCK_STORE_STRIPED) { - store_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + store_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } } }; From 454c4538278759322684e373773c45bb060cdbd9 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Fri, 10 May 2024 13:33:30 +0530 Subject: [PATCH 04/21] fix template param --- .../include/dpct/dpl_extras/dpcpp_extensions.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index 55c73c87937f..00b95caa6e53 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -647,11 +647,11 @@ store_subgroup_striped(const Item &item, OutputIteratorT block_itr, // OutputIteratorT: output iterator type // Item : typename parameter resembling sycl::nd_item<3> . template + typename OutputIteratorT, typename Item> class workgroup_store { - static size_t get_local_memory_size(size_t group_threads) { - return (group_threads * ITEMS_PER_WORK_ITEM) * sizeof(T); - } +public: + static size_t get_local_memory_size(size_t group_work_items) { return 0; } + workgroup_store(uint8_t *local_memory) : _local_memory(local_memory) {} __dpct_inline__ void store(const Item &item, OutputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -662,6 +662,9 @@ class workgroup_store { store_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } } + +private: + uint8_t *_local_memory; }; /// Perform a reduction of the data elements assigned to all threads in the From ffbd181ea15184dc8f3fa8ce47594a8bd52d4db8 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Tue, 14 May 2024 08:50:16 +0530 Subject: [PATCH 05/21] fix error --- .../dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index d0b3dfb57535..9d3bee39ae05 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -679,7 +679,7 @@ enum store_algorithm { }; /// Stores a blocked arrangement of work items linear segment of items. -template __dpct_inline__ void store_blocked(const Item &item, OutputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -696,7 +696,7 @@ __dpct_inline__ void store_blocked(const Item &item, OutputIteratorT block_itr, } /// Stores a striped arrangement of work items linear segment of items. -template __dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -766,9 +766,7 @@ class workgroup_store { private: uint8_t *_local_memory; - }; - /// Perform a reduction of the data elements assigned to all threads in the /// group. /// From 49147b8eac06a342708182e815ce983dd2523c22 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 29 May 2024 23:14:13 -0700 Subject: [PATCH 06/21] add in group_utils --- .../dpct/dpl_extras/dpcpp_extensions.h | 101 ----------------- .../dpct-rt/include/dpct/group_utils.hpp | 102 ++++++++++++++++++ 2 files changed, 102 insertions(+), 101 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h index eb427857761c..d8472c5c275e 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/dpcpp_extensions.h @@ -149,107 +149,6 @@ exclusive_scan(const Item &item, T input, BinaryOperation binary_op, return output; } -/// Store blocked/warped or striped work items into linear segment of items. -/// Helper for Block Store -enum store_algorithm { - - BLOCK_STORE_DIRECT, - BLOCK_STORE_STRIPED, - // To-do: BLOCK_STORE_WARP_TRANSPOSE - // To-do: BLOCK_STORE_VECTORIZE - -}; - -/// Stores a blocked arrangement of work items linear segment of items. -template -__dpct_inline__ void store_blocked(const Item &item, OutputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { - - // This implementation does not take in account range storage across - // workgroup items To-do: Decide whether range storage is required for group - // storage - size_t linear_tid = item.get_local_linear_id(); - OutputIteratorT workitem_itr = block_itr + (linear_tid * ITEMS_PER_WORK_ITEM); -#pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - workitem_itr[idx] = items[idx]; - } -} - -/// Stores a striped arrangement of work items linear segment of items. -template -__dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { - - // This implementation does not take in account range storage across - // workgroup items To-do: Decide whether range storage is required for group - // storage - size_t linear_tid = item.get_local_linear_id(); - OutputIteratorT workitem_itr = block_itr + linear_tid; - size_t GROUP_WORK_ITEMS = item.get_global_range().size(); -#pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - workitem_itr[(idx * GROUP_WORK_ITEMS)] = items[idx]; - } -} - -/// Stores a warp-striped arrangement of work items linear segment of items. -// Created as free function until exchange mechanism is -// implemented. -// To-do: inline this function with BLOCK_STORE_WARP_TRANSPOSE mechanism -template -__dpct_inline__ void -store_subgroup_striped(const Item &item, OutputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { - - // This implementation does not take in account range loading across - // workgroup items To-do: Decide whether range loading is required for group - // loading - // This implementation uses unintialized memory for loading linear segments - // into warp striped arrangement. - uint32_t subgroup_offset = item.get_sub_group().get_local_linear_id(); - uint32_t subgroup_size = item.get_sub_group().get_local_linear_range(); - uint32_t subgroup_idx = item.get_sub_group().get_group_linear_id(); - uint32_t initial_offset = - (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset; - OutputIteratorT workitem_itr = block_itr + initial_offset; -#pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - workitem_itr[(idx * subgroup_size)] = items[idx]; - } -} - -// template parameters : -// ITEMS_PER_WORK_ITEM: size_t variable controlling the number of items per -// thread/work_item -// ALGORITHM: store_algorithm variable controlling the type of store operation. -// InputT: type for input sequence. -// OutputIteratorT: output iterator type -// Item : typename parameter resembling sycl::nd_item<3> . -template -class workgroup_store { -public: - static size_t get_local_memory_size(size_t group_work_items) { return 0; } - workgroup_store(uint8_t *local_memory) : _local_memory(local_memory) {} - - __dpct_inline__ void store(const Item &item, OutputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { - - if constexpr (ALGORITHM == BLOCK_STORE_DIRECT) { - store_blocked(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); - } else if constexpr (ALGORITHM == BLOCK_STORE_STRIPED) { - store_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); - } - } - -private: - uint8_t *_local_memory; -}; - /// Perform a reduction of the data elements assigned to all threads in the /// group. /// diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index 9f0be34ab87f..dc4e9f746216 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -527,6 +527,108 @@ class workgroup_load { private: uint8_t *_local_memory; }; + +/// Store blocked/warped or striped work items into linear segment of items. +/// Helper for Block Store +enum store_algorithm { + + BLOCK_STORE_DIRECT, + BLOCK_STORE_STRIPED, + // To-do: BLOCK_STORE_WARP_TRANSPOSE + // To-do: BLOCK_STORE_VECTORIZE + +}; + +/// Stores a blocked arrangement of work items linear segment of items. +template +__dpct_inline__ void store_blocked(const Item &item, OutputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + // This implementation does not take in account range storage across + // workgroup items To-do: Decide whether range storage is required for group + // storage + size_t linear_tid = item.get_local_linear_id(); + OutputIteratorT workitem_itr = block_itr + (linear_tid * ITEMS_PER_WORK_ITEM); +#pragma unroll + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + workitem_itr[idx] = items[idx]; + } +} + +/// Stores a striped arrangement of work items linear segment of items. +template +__dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + // This implementation does not take in account range storage across + // workgroup items To-do: Decide whether range storage is required for group + // storage + size_t linear_tid = item.get_local_linear_id(); + OutputIteratorT workitem_itr = block_itr + linear_tid; + size_t GROUP_WORK_ITEMS = item.get_global_range().size(); +#pragma unroll + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + workitem_itr[(idx * GROUP_WORK_ITEMS)] = items[idx]; + } +} + +/// Stores a warp-striped arrangement of work items linear segment of items. +// Created as free function until exchange mechanism is +// implemented. +// To-do: inline this function with BLOCK_STORE_WARP_TRANSPOSE mechanism +template +__dpct_inline__ void +store_subgroup_striped(const Item &item, OutputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + // This implementation does not take in account range loading across + // workgroup items To-do: Decide whether range loading is required for group + // loading + // This implementation uses unintialized memory for loading linear segments + // into warp striped arrangement. + uint32_t subgroup_offset = item.get_sub_group().get_local_linear_id(); + uint32_t subgroup_size = item.get_sub_group().get_local_linear_range(); + uint32_t subgroup_idx = item.get_sub_group().get_group_linear_id(); + uint32_t initial_offset = + (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset; + OutputIteratorT workitem_itr = block_itr + initial_offset; +#pragma unroll + for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { + workitem_itr[(idx * subgroup_size)] = items[idx]; + } +} + +// template parameters : +// ITEMS_PER_WORK_ITEM: size_t variable controlling the number of items per +// thread/work_item +// ALGORITHM: store_algorithm variable controlling the type of store operation. +// InputT: type for input sequence. +// OutputIteratorT: output iterator type +// Item : typename parameter resembling sycl::nd_item<3> . +template +class workgroup_store { +public: + static size_t get_local_memory_size(size_t group_work_items) { return 0; } + workgroup_store(uint8_t *local_memory) : _local_memory(local_memory) {} + + __dpct_inline__ void store(const Item &item, OutputIteratorT block_itr, + InputT (&items)[ITEMS_PER_WORK_ITEM]) { + + if constexpr (ALGORITHM == BLOCK_STORE_DIRECT) { + store_blocked(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + } else if constexpr (ALGORITHM == BLOCK_STORE_STRIPED) { + store_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + } + } + +private: + uint8_t *_local_memory; +}; + } // namespace group } // namespace dpct From 18f826ad03a81739d1547a9fef396d098f629974 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 29 May 2024 23:53:48 -0700 Subject: [PATCH 07/21] use class --- clang/runtime/dpct-rt/include/dpct/group_utils.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index dc4e9f746216..e7586af6e8de 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -433,7 +433,7 @@ class radix_sort { /// Load linear segment items into block format across threads /// Helper for Block Load -enum load_algorithm { +enum class load_algorithm { BLOCK_LOAD_DIRECT, BLOCK_LOAD_STRIPED, @@ -517,9 +517,9 @@ class workgroup_load { __dpct_inline__ void load(const Item &item, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { - if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) { + if constexpr (ALGORITHM == load_algorithm::BLOCK_LOAD_DIRECT) { load_blocked(item, block_itr, items); - } else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) { + } else if constexpr (ALGORITHM == load_algorithm::BLOCK_LOAD_STRIPED) { load_striped(item, block_itr, items); } } @@ -530,7 +530,7 @@ class workgroup_load { /// Store blocked/warped or striped work items into linear segment of items. /// Helper for Block Store -enum store_algorithm { +enum class store_algorithm { BLOCK_STORE_DIRECT, BLOCK_STORE_STRIPED, @@ -618,9 +618,9 @@ class workgroup_store { __dpct_inline__ void store(const Item &item, OutputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { - if constexpr (ALGORITHM == BLOCK_STORE_DIRECT) { + if constexpr (ALGORITHM == store_algorithm::BLOCK_STORE_DIRECT) { store_blocked(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); - } else if constexpr (ALGORITHM == BLOCK_STORE_STRIPED) { + } else if constexpr (ALGORITHM == store_algorithm::BLOCK_STORE_STRIPED) { store_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); } } From 7149372c895c3e4e03a0b4a2875979f96a7ac10f Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 29 May 2024 23:58:22 -0700 Subject: [PATCH 08/21] review commit --- clang/runtime/dpct-rt/include/dpct/group_utils.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index e7586af6e8de..dc6df547cd5b 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -437,7 +437,6 @@ enum class load_algorithm { BLOCK_LOAD_DIRECT, BLOCK_LOAD_STRIPED, - // To-do: BLOCK_LOAD_WARP_TRANSPOSE }; @@ -534,8 +533,6 @@ enum class store_algorithm { BLOCK_STORE_DIRECT, BLOCK_STORE_STRIPED, - // To-do: BLOCK_STORE_WARP_TRANSPOSE - // To-do: BLOCK_STORE_VECTORIZE }; From 431d4a42052a279cf609e0e385d3cc18d6281756 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 30 May 2024 00:14:11 -0700 Subject: [PATCH 09/21] format --- .../dpct-rt/include/dpct/group_utils.hpp | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index dc6df547cd5b..ce480b3bdd38 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -537,10 +537,10 @@ enum class store_algorithm { }; /// Stores a blocked arrangement of work items linear segment of items. -template +template __dpct_inline__ void store_blocked(const Item &item, OutputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { + InputT (&items)[ITEMS_PER_WORK_ITEM]) { // This implementation does not take in account range storage across // workgroup items To-do: Decide whether range storage is required for group @@ -557,7 +557,7 @@ __dpct_inline__ void store_blocked(const Item &item, OutputIteratorT block_itr, template __dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { + InputT (&items)[ITEMS_PER_WORK_ITEM]) { // This implementation does not take in account range storage across // workgroup items To-do: Decide whether range storage is required for group @@ -579,7 +579,7 @@ template __dpct_inline__ void store_subgroup_striped(const Item &item, OutputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { + InputT (&items)[ITEMS_PER_WORK_ITEM]) { // This implementation does not take in account range loading across // workgroup items To-do: Decide whether range loading is required for group @@ -605,20 +605,22 @@ store_subgroup_striped(const Item &item, OutputIteratorT block_itr, // InputT: type for input sequence. // OutputIteratorT: output iterator type // Item : typename parameter resembling sycl::nd_item<3> . -template +template class workgroup_store { public: static size_t get_local_memory_size(size_t group_work_items) { return 0; } workgroup_store(uint8_t *local_memory) : _local_memory(local_memory) {} __dpct_inline__ void store(const Item &item, OutputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { + InputT (&items)[ITEMS_PER_WORK_ITEM]) { if constexpr (ALGORITHM == store_algorithm::BLOCK_STORE_DIRECT) { - store_blocked(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + store_blocked(item, block_itr, + (&items)[ITEMS_PER_WORK_ITEM]); } else if constexpr (ALGORITHM == store_algorithm::BLOCK_STORE_STRIPED) { - store_striped(item, block_itr, (&items)[ITEMS_PER_WORK_ITEM]); + store_striped(item, block_itr, + (&items)[ITEMS_PER_WORK_ITEM]); } } From 8cc73f11ee60f0d009c98a44498cf4999c23edc0 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 6 Jun 2024 09:51:31 +0530 Subject: [PATCH 10/21] review commit --- clang/runtime/dpct-rt/include/dpct/group_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index ce480b3bdd38..5ead421b0b13 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -564,7 +564,7 @@ __dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr, // storage size_t linear_tid = item.get_local_linear_id(); OutputIteratorT workitem_itr = block_itr + linear_tid; - size_t GROUP_WORK_ITEMS = item.get_global_range().size(); + size_t GROUP_WORK_ITEMS = item.get_local_range().size(); #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { workitem_itr[(idx * GROUP_WORK_ITEMS)] = items[idx]; From 98d019305a463c9403491cdaa39a22026f56acf4 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 4 Jul 2024 09:19:46 +0530 Subject: [PATCH 11/21] clang-format --- clang/runtime/dpct-rt/include/dpct/group_utils.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index 5ead421b0b13..dafe8f6b5097 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -554,8 +554,8 @@ __dpct_inline__ void store_blocked(const Item &item, OutputIteratorT block_itr, } /// Stores a striped arrangement of work items linear segment of items. -template +template __dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { From c4fe03577cdb9e0027470982150d27d57815b99c Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 11 Jul 2024 19:53:37 +0530 Subject: [PATCH 12/21] reorder template args for better visibility in parsing --- clang/runtime/dpct-rt/include/dpct/group_utils.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index dafe8f6b5097..3146b131d606 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -537,7 +537,7 @@ enum class store_algorithm { }; /// Stores a blocked arrangement of work items linear segment of items. -template __dpct_inline__ void store_blocked(const Item &item, OutputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -554,7 +554,7 @@ __dpct_inline__ void store_blocked(const Item &item, OutputIteratorT block_itr, } /// Stores a striped arrangement of work items linear segment of items. -template __dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -575,7 +575,7 @@ __dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr, // Created as free function until exchange mechanism is // implemented. // To-do: inline this function with BLOCK_STORE_WARP_TRANSPOSE mechanism -template __dpct_inline__ void store_subgroup_striped(const Item &item, OutputIteratorT block_itr, From 76ec68433875b3fc37ff1b8159a528d397352788 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 12 Aug 2024 11:57:55 +0530 Subject: [PATCH 13/21] revert template alignment --- clang/runtime/dpct-rt/include/dpct/group_utils.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index 3146b131d606..dafe8f6b5097 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -537,7 +537,7 @@ enum class store_algorithm { }; /// Stores a blocked arrangement of work items linear segment of items. -template __dpct_inline__ void store_blocked(const Item &item, OutputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -554,7 +554,7 @@ __dpct_inline__ void store_blocked(const Item &item, OutputIteratorT block_itr, } /// Stores a striped arrangement of work items linear segment of items. -template __dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { @@ -575,7 +575,7 @@ __dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr, // Created as free function until exchange mechanism is // implemented. // To-do: inline this function with BLOCK_STORE_WARP_TRANSPOSE mechanism -template __dpct_inline__ void store_subgroup_striped(const Item &item, OutputIteratorT block_itr, From 41b1c8a4827e9fa07a49311cf9fe99accf4a3ea2 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 12 Aug 2024 20:22:36 +0530 Subject: [PATCH 14/21] fix temps pointer --- clang/runtime/dpct-rt/include/dpct/group_utils.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index dafe8f6b5097..db8534b1bd1e 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -616,11 +616,9 @@ class workgroup_store { InputT (&items)[ITEMS_PER_WORK_ITEM]) { if constexpr (ALGORITHM == store_algorithm::BLOCK_STORE_DIRECT) { - store_blocked(item, block_itr, - (&items)[ITEMS_PER_WORK_ITEM]); + store_blocked(item, block_itr, items); } else if constexpr (ALGORITHM == store_algorithm::BLOCK_STORE_STRIPED) { - store_striped(item, block_itr, - (&items)[ITEMS_PER_WORK_ITEM]); + store_striped(item, block_itr, items); } } From b046dcc0cf8c7a2a97d37bc460ee30cc0ca6bdbb Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 21 Aug 2024 20:04:36 +0530 Subject: [PATCH 15/21] rectify comment --- clang/runtime/dpct-rt/include/dpct/group_utils.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index db8534b1bd1e..7ae619440315 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -584,8 +584,7 @@ store_subgroup_striped(const Item &item, OutputIteratorT block_itr, // This implementation does not take in account range loading across // workgroup items To-do: Decide whether range loading is required for group // loading - // This implementation uses unintialized memory for loading linear segments - // into warp striped arrangement. + // This implementation loads linear segments into warp striped arrangement. uint32_t subgroup_offset = item.get_sub_group().get_local_linear_id(); uint32_t subgroup_size = item.get_sub_group().get_local_linear_range(); uint32_t subgroup_idx = item.get_sub_group().get_group_linear_id(); From 273d09804c2427d6b7a437197664a59ad06c0caf Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 22 Aug 2024 14:09:18 +0530 Subject: [PATCH 16/21] Update clang/runtime/dpct-rt/include/dpct/group_utils.hpp Co-authored-by: Dan Hoeflinger <109972525+danhoeflinger@users.noreply.github.com> --- clang/runtime/dpct-rt/include/dpct/group_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index a77484fad0a0..172066304370 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -897,7 +897,7 @@ __dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr, } } -/// Stores a warp-striped arrangement of work items linear segment of items. +/// Stores a subgroup-striped arrangement of work items linear segment of items. // Created as free function until exchange mechanism is // implemented. // To-do: inline this function with BLOCK_STORE_WARP_TRANSPOSE mechanism From 3185ceb3b9e52ad118d1f569315f3b38c26f9e73 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 22 Aug 2024 14:18:31 +0530 Subject: [PATCH 17/21] Update group_utils.hpp --- clang/runtime/dpct-rt/include/dpct/group_utils.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index 172066304370..ee068f940d76 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -890,10 +890,10 @@ __dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr, // storage size_t linear_tid = item.get_local_linear_id(); OutputIteratorT workitem_itr = block_itr + linear_tid; - size_t GROUP_WORK_ITEMS = item.get_local_range().size(); + size_t group_work_items = item.get_local_range().size(); #pragma unroll for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - workitem_itr[(idx * GROUP_WORK_ITEMS)] = items[idx]; + workitem_itr[(idx * group_work_items)] = items[idx]; } } From cc004033eb7a7ad3461fa77677a32b525fb71835 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 22 Aug 2024 14:45:53 +0530 Subject: [PATCH 18/21] fix review comments --- clang/runtime/dpct-rt/include/dpct/group_utils.hpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index ee068f940d76..b0a7aa0052fa 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -907,13 +907,14 @@ __dpct_inline__ void store_subgroup_striped(const Item &item, OutputIteratorT block_itr, InputT (&items)[ITEMS_PER_WORK_ITEM]) { - // This implementation does not take in account range loading across - // workgroup items To-do: Decide whether range loading is required for group + // This implementation does not take in account range storing across + // workgroup items To-do: Decide whether range storing is required for group // loading // This implementation loads linear segments into warp striped arrangement. - uint32_t subgroup_offset = item.get_sub_group().get_local_linear_id(); - uint32_t subgroup_size = item.get_sub_group().get_local_linear_range(); - uint32_t subgroup_idx = item.get_sub_group().get_group_linear_id(); + auto sub_group = item.get_subgroup(); + uint32_t subgroup_offset = sub_group.get_local_linear_id(); + uint32_t subgroup_size = sub_group.get_local_linear_range(); + uint32_t subgroup_idx = sub_group.get_group_linear_id(); uint32_t initial_offset = (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset; OutputIteratorT workitem_itr = block_itr + initial_offset; @@ -948,6 +949,9 @@ class workgroup_store { } private: + // local_memory is a placeholder ,currently unused, as no operations use + // extra memory but placed here to make migrations easier. + // For future exchange operations might be necessary uint8_t *_local_memory; }; From 28ff8682f32bab2d9079ec78de90e819bd9fe270 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 26 Aug 2024 11:44:44 +0530 Subject: [PATCH 19/21] fix --- clang/runtime/dpct-rt/include/dpct/group_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index e440bc9b68e2..1c06f2734ca8 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -712,7 +712,7 @@ enum class load_algorithm { BLOCK_LOAD_DIRECT, BLOCK_LOAD_STRIPED, - +}; // loads a linear segment of workgroup items into a blocked arrangement. template From e87c0a635b0ef7355d55badebefd032e7002b329 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 26 Aug 2024 11:46:46 +0530 Subject: [PATCH 20/21] Update clang/runtime/dpct-rt/include/dpct/group_utils.hpp Co-authored-by: Dan Hoeflinger <109972525+danhoeflinger@users.noreply.github.com> --- clang/runtime/dpct-rt/include/dpct/group_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index 1c06f2734ca8..f8f563e0544d 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -883,7 +883,7 @@ store_subgroup_striped(const Item &item, OutputIteratorT block_itr, // This implementation does not take in account range storing across // workgroup items To-do: Decide whether range storing is required for group // loading - // This implementation loads linear segments into warp striped arrangement. + // This implementation loads linear segments into subgroup striped arrangement. auto sub_group = item.get_sub_group(); uint32_t subgroup_offset = sub_group.get_local_linear_id(); uint32_t subgroup_size = sub_group.get_local_linear_range(); From 1802fbe089c1166b6114de230b30ff91fcde34b5 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 26 Aug 2024 18:57:30 +0530 Subject: [PATCH 21/21] update correct variables --- clang/runtime/dpct-rt/include/dpct/group_utils.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index f8f563e0544d..1efc70abe945 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -874,11 +874,11 @@ uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, // Created as free function until exchange mechanism is // implemented. // To-do: inline this function with BLOCK_STORE_WARP_TRANSPOSE mechanism -template +template __dpct_inline__ void -store_subgroup_striped(const Item &item, OutputIteratorT block_itr, - InputT (&items)[ITEMS_PER_WORK_ITEM]) { +store_subgroup_striped(const ItemT &item, OutputIteratorT block_itr, + T (&data)[ElementsPerWorkItem]) { // This implementation does not take in account range storing across // workgroup items To-do: Decide whether range storing is required for group @@ -889,11 +889,11 @@ store_subgroup_striped(const Item &item, OutputIteratorT block_itr, uint32_t subgroup_size = sub_group.get_local_linear_range(); uint32_t subgroup_idx = sub_group.get_group_linear_id(); uint32_t initial_offset = - (subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset; + (subgroup_idx * ElementsPerWorkItem * subgroup_size) + subgroup_offset; OutputIteratorT workitem_itr = block_itr + initial_offset; #pragma unroll - for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { - workitem_itr[(idx * subgroup_size)] = items[idx]; + for (uint32_t idx = 0; idx < ElementsPerWorkItem; idx++) { + workitem_itr[(idx * subgroup_size)] = data[idx]; } }