From 6abb9fe1592739515e0e8577158c3ee691cef892 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Fri, 15 Dec 2023 09:43:24 -0600
Subject: [PATCH] Changed inclusive sum implementation from recursive to iterative
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Changed hyperparameter choices to be different for CPU and GPU devices,
resulting in a 20% performance gain on GPU.

The non-recursive implementation avoids repeated USM allocations,
resulting in performance gains for large arrays.

Furthermore, the base step kernel was corrected to accumulate in outputT
rather than in size_t, which realizes additional savings when int32 is
used as the accumulator type.

Using the example from gh-1249, previously, on my Iris Xe laptop:

```
In [1]: import dpctl.tensor as dpt
   ...: ag = dpt.ones((8192, 8192), device='gpu', dtype='f4')
   ...: bg = dpt.ones((8192, 8192), device='gpu', dtype=bool)

In [2]: cg = ag[bg]

In [3]: dpt.all(cg == dpt.reshape(ag, -1))
Out[3]: usm_ndarray(True)

In [4]: %timeit -n 10 -r 3 cg = ag[bg]
212 ms ± 56 ms per loop (mean ± std. dev. of 3 runs, 10 loops each)
```

while with this change:

```
In [4]: %timeit -n 10 -r 3 cg = ag[bg]
178 ms ± 24.2 ms per loop (mean ± std. dev. of 3 runs, 10 loops each)
```
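For illustration only, below is a minimal serial sketch of the iterative
block-scan scheme (plain host-side C++; the helper names `scan_chunks` and
`inclusive_scan_iterative` are invented for this sketch and do not exist in
the library). Each chunk is scanned independently, the chunk totals are then
scanned level by level with the levels kept on a stack, and the offsets are
finally propagated back down. Because the number and sizes of the levels are
known up front, a single temporary allocation suffices, which is what removes
the repeated USM allocations of the recursive version:

```
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Chunk-wise inclusive scan: each chunk of `chunk` elements is scanned
// independently (on the device this is what a single work-group does).
static void scan_chunks(std::vector<long> &a, std::size_t chunk)
{
    for (std::size_t b = 0; b * chunk < a.size(); ++b) {
        long acc = 0;
        const std::size_t end = std::min(a.size(), (b + 1) * chunk);
        for (std::size_t i = b * chunk; i < end; ++i) {
            acc += a[i];
            a[i] = acc;
        }
    }
}

// Iterative (non-recursive) inclusive scan: scan chunks, then repeatedly
// scan the chunk totals of the previous level, keeping each level of
// totals on a stack; finally walk the stack in reverse and add the
// accumulated offsets back in.
std::vector<long> inclusive_scan_iterative(std::vector<long> a,
                                           std::size_t chunk)
{
    scan_chunks(a, chunk);

    // levels[0] holds the scanned chunk totals of `a`,
    // levels[k] holds the scanned chunk totals of levels[k - 1]
    std::vector<std::vector<long>> levels;

    const std::vector<long> *src = &a;
    while ((src->size() + chunk - 1) / chunk > 1) {
        std::vector<long> totals;
        for (std::size_t i = chunk - 1; i < src->size(); i += chunk) {
            totals.push_back((*src)[i]);
        }
        scan_chunks(totals, chunk);
        levels.push_back(std::move(totals));
        src = &levels.back();
    }

    // the deepest level fits in one chunk, so it is already fully scanned;
    // propagate offsets back down one level at a time
    for (std::size_t k = levels.size(); k-- > 0;) {
        std::vector<long> &arr = (k == 0) ? a : levels[k - 1];
        const std::vector<long> &offs = levels[k];
        for (std::size_t i = chunk; i < arr.size(); ++i) {
            arr[i] += offs[i / chunk - 1];
        }
    }
    return a;
}

int main()
{
    std::vector<long> v(1000, 1);
    const auto s = inclusive_scan_iterative(v, 8);
    assert(s[7] == 8 && s.back() == 1000);
    return 0;
}
```

On the device a "chunk" corresponds to one work-group (wg_size * n_wi
elements per group), and all per-level totals live in the single `temp`
USM allocation whose size is computed before the loop in
`inclusive_scan_iter`.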
---
 .../include/kernels/accumulators.hpp          | 375 ++++++++++++------
 1 file changed, 258 insertions(+), 117 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp
index da461d56b0..491fb12126 100644
--- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp
@@ -51,23 +51,6 @@ template <typename T> T ceiling_quotient(T n, T m)
 {
     return (n + m - 1) / m;
 }
-template <typename T1, typename T2> T1 ceiling_quotient(T1 n, T2 m)
-{
-    return ceiling_quotient<T1>(n, static_cast<T1>(m));
-}
-
-template <typename inputT,
-          typename outputT,
-          size_t n_wi,
-          typename IndexerT,
-          typename TransformerT>
-class inclusive_scan_rec_local_scan_krn;
-
-template <typename inputT,
-          typename outputT,
-          size_t n_wi,
-          typename IndexerT,
-          typename TransformerT>
-class inclusive_scan_rec_chunk_update_krn;
 
 template <typename inputT, typename outputT> struct NonZeroIndicator
 {
@@ -93,129 +76,261 @@ template <typename T> struct NoOpTransformer
     }
 };
 
-/*
- * for integer type maskT,
- *       output[j] = sum( input[s0 + i * s1], 0 <= i <= j)
- * for 0 <= j < n_elems
- */
+// Iterative cumulative summation
+
+using nwiT = std::uint16_t;
+
 template <typename inputT,
           typename outputT,
-          size_t n_wi,
+          nwiT n_wi,
           typename IndexerT,
           typename TransformerT>
-sycl::event inclusive_scan_rec(sycl::queue &exec_q,
-                               size_t n_elems,
-                               size_t wg_size,
-                               const inputT *input,
-                               outputT *output,
-                               size_t s0,
-                               size_t s1,
-                               IndexerT indexer,
-                               TransformerT transformer,
-                               std::vector<sycl::event> &host_tasks,
-                               const std::vector<sycl::event> &depends = {})
+class inclusive_scan_iter_local_scan_krn;
+
+template <typename inputT,
+          typename outputT,
+          nwiT n_wi,
+          typename IndexerT,
+          typename TransformerT,
+          typename OtherIndexerT,
+          typename OtherTransformerT>
+class inclusive_scan_iter_chunk_update_krn;
+
+template <typename inputT,
+          typename outputT,
+          nwiT n_wi,
+          typename IndexerT,
+          typename TransformerT>
+sycl::event
+inclusive_scan_base_step(sycl::queue &exec_q,
+                         const size_t wg_size,
+                         const size_t n_elems,
+                         const inputT *input,
+                         outputT *output,
+                         const size_t s0,
+                         const size_t s1,
+                         IndexerT indexer,
+                         TransformerT transformer,
+                         size_t &n_groups,
+                         const std::vector<sycl::event> &depends = {})
 {
-    size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size);
+    n_groups = ceiling_quotient<size_t>(n_elems, n_wi * wg_size);
 
-    const sycl::event &inc_scan_phase1_ev =
-        exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(depends);
+    sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
 
-            using slmT = sycl::local_accessor<size_t, 1>;
+        using slmT = sycl::local_accessor<outputT, 1>;
 
-            auto lws = sycl::range<1>(wg_size);
-            auto gws = sycl::range<1>(n_groups * wg_size);
+        auto lws = sycl::range<1>(wg_size);
+        auto gws = sycl::range<1>(n_groups * wg_size);
 
-            auto ndRange = sycl::nd_range<1>(gws, lws);
+        auto ndRange = sycl::nd_range<1>(gws, lws);
 
-            slmT slm_iscan_tmp(lws, cgh);
+        slmT slm_iscan_tmp(lws, cgh);
 
-            using KernelName = inclusive_scan_rec_local_scan_krn<
-                inputT, outputT, n_wi, IndexerT, decltype(transformer)>;
+        using KernelName =
+            inclusive_scan_iter_local_scan_krn<inputT, outputT, n_wi, IndexerT,
+                                               TransformerT>;
 
-            cgh.parallel_for<KernelName>(ndRange, [=, slm_iscan_tmp = std::move(
-                                                          slm_iscan_tmp)](
-                                                      sycl::nd_item<1> it) {
-                auto chunk_gid = it.get_global_id(0);
-                auto lid = it.get_local_id(0);
+        cgh.parallel_for<KernelName>(ndRange, [=, slm_iscan_tmp =
+                                                      std::move(slm_iscan_tmp)](
                                                   sycl::nd_item<1> it) {
+            auto chunk_gid = it.get_global_id(0);
+            auto lid = it.get_local_id(0);
 
-                std::array<size_t, n_wi> local_isum;
+            std::array<outputT, n_wi> local_isum;
 
-                size_t i = chunk_gid * n_wi;
-                for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) {
-                    constexpr outputT out_zero(0);
+            size_t i = chunk_gid * n_wi;
+
+#pragma unroll
+            for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
+                constexpr outputT out_zero(0);
 
-                    local_isum[m_wi] =
-                        (i + m_wi < n_elems)
-                            ? transformer(input[indexer(s0 + s1 * (i + m_wi))])
-                            : out_zero;
-                }
+                local_isum[m_wi] =
+                    (i + m_wi < n_elems)
+                        ? transformer(input[indexer(s0 + s1 * (i + m_wi))])
+                        : out_zero;
+            }
 
 #pragma unroll
-                for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) {
-                    local_isum[m_wi] += local_isum[m_wi - 1];
-                }
-                // local_isum is now result of
-                // inclusive scan of locally stored inputs
+            for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) {
+                local_isum[m_wi] += local_isum[m_wi - 1];
+            }
+            // local_isum is now result of
+            // inclusive scan of locally stored inputs
 
-                size_t wg_iscan_val = sycl::inclusive_scan_over_group(
-                    it.get_group(), local_isum.back(), sycl::plus<size_t>(),
-                    size_t(0));
+            outputT wg_iscan_val = sycl::inclusive_scan_over_group(
+                it.get_group(), local_isum.back(), sycl::plus<outputT>(),
+                outputT(0));
 
-                slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val;
-                it.barrier(sycl::access::fence_space::local_space);
-                size_t addand = (lid == 0) ? 0 : slm_iscan_tmp[lid];
+            slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val;
+            it.barrier(sycl::access::fence_space::local_space);
+            outputT addand = (lid == 0) ? outputT(0) : slm_iscan_tmp[lid];
 
 #pragma unroll
-                for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) {
-                    local_isum[m_wi] += addand;
-                }
-
-                for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi)
-                {
-                    output[i + m_wi] = local_isum[m_wi];
-                }
-            });
+            for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
+                local_isum[m_wi] += addand;
+            }
+
+            for (nwiT m_wi = 0; (m_wi < n_wi) && (i + m_wi < n_elems); ++m_wi) {
+                output[i + m_wi] = local_isum[m_wi];
+            }
         });
+    });
 
-    sycl::event out_event = inc_scan_phase1_ev;
-    if (n_groups > 1) {
-        outputT *temp = sycl::malloc_device<outputT>(n_groups - 1, exec_q);
+    return inc_scan_phase1_ev;
+}
+
+namespace
+{
+template <typename T> class stack_t
+{
+    T *src_;
+    size_t size_;
+    T *local_scans_;
+
+public:
+    stack_t() : src_{}, size_{}, local_scans_{} {}
+    stack_t(T *src, size_t sz, T *local_scans)
+        : src_(src), size_(sz), local_scans_(local_scans)
+    {
+    }
+    ~stack_t(){};
+
+    T *get_src_ptr() const
+    {
+        return src_;
+    }
+
+    const T *get_src_const_ptr() const
+    {
+        return src_;
+    }
+
+    size_t get_size() const
+    {
+        return size_;
+    }
+
+    T *get_local_scans_ptr() const
+    {
+        return local_scans_;
+    }
+};
+} // end of anonymous namespace
+
+/*
+ * for integer type maskT,
+ *       output[j] = sum( input[s0 + i * s1], 0 <= i <= j)
+ * for 0 <= j < n_elems
+ */
+template <typename inputT,
+          typename outputT,
+          nwiT n_wi,
+          typename IndexerT,
+          typename TransformerT>
+sycl::event inclusive_scan_iter(sycl::queue &exec_q,
+                                const size_t wg_size,
+                                const size_t n_elems,
+                                const inputT *input,
+                                outputT *output,
+                                const size_t s0,
+                                const size_t s1,
+                                IndexerT indexer,
+                                TransformerT transformer,
+                                std::vector<sycl::event> &host_tasks,
+                                const std::vector<sycl::event> &depends = {})
+{
+    size_t n_groups;
+    sycl::event inc_scan_phase1_ev =
+        inclusive_scan_base_step<inputT, outputT, n_wi, IndexerT,
+                                 TransformerT>(
+            exec_q, wg_size, n_elems, input, output, s0, s1, indexer,
+            transformer, n_groups, depends);
 
+    sycl::event dependent_event = inc_scan_phase1_ev;
+    if (n_groups > 1) {
         auto chunk_size = wg_size * n_wi;
 
+        // how much of temporary allocation do we need
+        size_t n_groups_ = n_groups;
+        size_t temp_size = 0;
+        while (n_groups_ > 1) {
+            const auto this_size = (n_groups_ - 1);
+            temp_size += this_size;
+            n_groups_ = ceiling_quotient<size_t>(this_size, chunk_size);
+        }
+
+        // allocate
+        outputT *temp = sycl::malloc_device<outputT>(temp_size, exec_q);
+
+        if (!temp) {
+            throw std::bad_alloc();
+        }
+
+        std::vector<stack_t<outputT>> stack{};
+
+        // inclusive scans over blocks
+        n_groups_ = n_groups;
+        outputT *src = output;
+        outputT *local_scans = temp;
+
         NoOpIndexer _no_op_indexer{};
-        NoOpTransformer<outputT> _no_op_transformer{};
-        auto e2 = inclusive_scan_rec<outputT, outputT, n_wi, NoOpIndexer,
-                                     decltype(_no_op_transformer)>(
-            exec_q, n_groups - 1, wg_size, output, temp, chunk_size - 1,
-            chunk_size, _no_op_indexer, _no_op_transformer, host_tasks,
-            {inc_scan_phase1_ev});
-
-        // output[ chunk_size * (i + 1) + j] += temp[i]
-        auto e3 = exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(e2);
-            cgh.parallel_for<class inclusive_scan_rec_chunk_update_krn<
-                inputT, outputT, n_wi, IndexerT, decltype(transformer)>>(
-                {n_elems}, [=](auto wiid)
-                {
-                    auto gid = wiid[0];
-                    auto i = (gid / chunk_size);
-                    output[gid] += (i > 0) ? temp[i - 1] : 0;
+        using NoOpTransformerT = NoOpTransformer<outputT>;
+        NoOpTransformerT _no_op_transformer{};
+        size_t size_to_update = n_elems;
+        while (n_groups_ > 1) {
+
+            size_t src_size = n_groups_ - 1;
+            dependent_event =
+                inclusive_scan_base_step<outputT, outputT, n_wi, NoOpIndexer,
+                                         NoOpTransformerT>(
+                    exec_q, wg_size, src_size, src, local_scans, chunk_size - 1,
+                    chunk_size, _no_op_indexer, _no_op_transformer,
+                    n_groups_, // n_groups_ is modified in place
+                    {dependent_event});
+            stack.push_back({src, size_to_update, local_scans});
+            src = local_scans;
+            local_scans += src_size;
+            size_to_update = src_size;
+        }
+
+        for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size();
+             ++reverse_stack_id)
+        {
+            auto stack_id = stack.size() - 1 - reverse_stack_id;
+
+            auto stack_elem = stack[stack_id];
+            outputT *src = stack_elem.get_src_ptr();
+            size_t src_size = stack_elem.get_size();
+            outputT *local_scans = stack_elem.get_local_scans_ptr();
+
+            // output[ chunk_size * (i + 1) + j] += temp[i]
+            dependent_event = exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(dependent_event);
+
+                using UpdateKernelName =
+                    class inclusive_scan_iter_chunk_update_krn<
+                        inputT, outputT, n_wi, IndexerT, TransformerT,
+                        NoOpIndexer, NoOpTransformerT>;
+
+                cgh.parallel_for<UpdateKernelName>(
+                    {src_size}, [chunk_size, src, local_scans](auto wiid) {
+                        auto gid = wiid[0];
+                        auto i = (gid / chunk_size);
+                        src[gid] += (i > 0) ? local_scans[i - 1] : outputT(0);
+                    });
             });
-        });
+        }
 
         sycl::event e4 = exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(e3);
+            cgh.depends_on(dependent_event);
             const auto &ctx = exec_q.get_context();
             cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); });
         });
         host_tasks.push_back(e4);
-
-        out_event = std::move(e3);
     }
 
-    return out_event;
+    return dependent_event;
 }
 
 typedef size_t (*accumulate_contig_impl_fn_ptr_t)(
@@ -234,20 +349,33 @@ size_t accumulate_contig_impl(sycl::queue &q,
                               std::vector<sycl::event> &host_tasks,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr int n_wi = 8;
     const maskT *mask_data_ptr = reinterpret_cast<const maskT *>(mask);
     cumsumT *cumsum_data_ptr = reinterpret_cast<cumsumT *>(cumsum);
-    size_t wg_size = 128;
 
     NoOpIndexer flat_indexer{};
     transformerT non_zero_indicator{};
 
-    const sycl::event &comp_ev =
-        inclusive_scan_rec<maskT, cumsumT, n_wi, decltype(flat_indexer),
-                           decltype(non_zero_indicator)>(
-            q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1,
-            flat_indexer, non_zero_indicator, host_tasks, depends);
+    sycl::event comp_ev;
+    const sycl::device &dev = q.get_device();
+    if (dev.has(sycl::aspect::cpu)) {
+        constexpr nwiT n_wi_for_cpu = 8;
+        size_t wg_size = 256;
+        comp_ev = inclusive_scan_iter<maskT, cumsumT, n_wi_for_cpu,
+                                      decltype(flat_indexer),
+                                      decltype(non_zero_indicator)>(
+            q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1,
+            flat_indexer, non_zero_indicator, host_tasks, depends);
+    }
+    else {
+        constexpr nwiT n_wi_for_gpu = 4;
+        size_t wg_size = 256;
+        comp_ev = inclusive_scan_iter<maskT, cumsumT, n_wi_for_gpu,
+                                      decltype(flat_indexer),
+                                      decltype(non_zero_indicator)>(
+            q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1,
+            flat_indexer, non_zero_indicator, host_tasks, depends);
+    }
 
     cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1);
 
     cumsumT *last_elem_host_usm = sycl::malloc_host<cumsumT>(1, q);
@@ -324,19 +452,32 @@ size_t accumulate_strided_impl(sycl::queue &q,
                               std::vector<sycl::event> &host_tasks,
                               const std::vector<sycl::event> &depends = {})
 {
-    constexpr int n_wi = 8;
     const maskT *mask_data_ptr = reinterpret_cast<const maskT *>(mask);
     cumsumT *cumsum_data_ptr = reinterpret_cast<cumsumT *>(cumsum);
-    size_t wg_size = 128;
 
     StridedIndexer strided_indexer{nd, 0, shape_strides};
     transformerT non_zero_indicator{};
 
-    const sycl::event &comp_ev =
-        inclusive_scan_rec<maskT, cumsumT, n_wi, decltype(strided_indexer),
-                           decltype(non_zero_indicator)>(
-            q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1,
+    const sycl::device &dev = q.get_device();
+    sycl::event comp_ev;
+    if (dev.has(sycl::aspect::cpu)) {
+        constexpr nwiT n_wi_for_cpu = 8;
+        size_t wg_size = 256;
+        comp_ev = inclusive_scan_iter<maskT, cumsumT, n_wi_for_cpu,
+                                      decltype(strided_indexer),
+                                      decltype(non_zero_indicator)>(
+            q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1,
            strided_indexer, non_zero_indicator, host_tasks, depends);
+    }
+    else {
+        constexpr nwiT n_wi_for_gpu = 4;
+        size_t wg_size = 256;
+        comp_ev = inclusive_scan_iter<maskT, cumsumT, n_wi_for_gpu,
+                                      decltype(strided_indexer),
+                                      decltype(non_zero_indicator)>(
+            q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1,
+            strided_indexer, non_zero_indicator, host_tasks, depends);
+    }
 
     cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1);