diff --git a/include/common/core/memory.hpp b/include/common/core/memory.hpp index 5854537dc..b1b031226 100644 --- a/include/common/core/memory.hpp +++ b/include/common/core/memory.hpp @@ -355,9 +355,13 @@ __XETLA_API xetla_vector xetla_load_global( __ESIMD_NS::cache_hint_L1, __ESIMD_NS::cache_hint_L2, __ESIMD_NS::alignment}; - if constexpr (sizeof(T) * N < sizeof(uint32_t)) { - xetla_vector offsets(byte_offset, sizeof(T)); - return __ESIMD_NS::gather(ptr, offsets); + if constexpr (sizeof(T) * N < sizeof(uint32_t) || N == 1) { + xetla_vector ret; +#pragma unroll + for (uint32_t i = 0; i < N; i++) { + ret[i] = ptr[i + byte_offset / sizeof(T)]; + } + return ret; } else { return __ESIMD_NS::block_load(ptr, byte_offset, props); } @@ -501,9 +505,11 @@ __XETLA_API void xetla_store_global( __ESIMD_NS::cache_hint_L2, __ESIMD_NS::alignment}; - if constexpr (sizeof(T) * N < sizeof(uint32_t)) { - xetla_vector offsets(byte_offset, sizeof(T)); - return __ESIMD_NS::scatter(ptr, offsets, vals); + if constexpr (sizeof(T) * N < sizeof(uint32_t) || N == 1) { +#pragma unroll + for (uint32_t i = 0; i < N; i++) { + ptr[i + byte_offset / sizeof(T)] = vals[i]; + } } else { __ESIMD_NS::block_store(ptr, byte_offset, vals, props); }