From dfdfb0f937b7600ad3aedc4f2038676f53ccecf8 Mon Sep 17 00:00:00 2001
From: ThanatosShinji <108169286+ThanatosShinji@users.noreply.github.com>
Date: Thu, 25 Apr 2024 10:32:24 +0800
Subject: [PATCH] [BesTLA] The initial SYCL support (#229)

* sycl init
* add helper
* add epilogue base
* launcher done
* dequant code
* add s4sgemm
* add sgemv
* add trans B support
* add hgemm
* finish half gemms
* add dequant kernels
* keep TILEK code
* enable all cases
* fix perf on MTL
* update gemv
* update fp16 performance
* add half for getweight
* add tail process for gemm and epilogue
* remove sycl sources when disabled
* protect mha from unsupported compilers
---
 CMakePresets.json                            |  12 +
 bestla/CMakeLists.txt                        |  37 +-
 bestla/CMakePresets.json                     |  28 +
 bestla/bestla/bestla_utils.h                 |  14 +-
 bestla/bestla/kernel_avx2.h                  |  12 +-
 bestla/bestla/kernel_avx512_bf16.h           |   6 +
 bestla/bestla/kernel_wrapper.h               |  32 +-
 bestla/bestla/sycl/sycl_device.h             |  73 ++
 bestla/bestla/sycl/sycl_epilogue.h           |  59 ++
 bestla/bestla/sycl/sycl_gemm.h               | 227 ++++++
 bestla/bestla/sycl/sycl_prologue_a.h         |  41 ++
 bestla/bestla/sycl/sycl_prologue_b.h         | 455 ++++++++++++
 bestla/bestla/sycl/sycl_utils.h              | 110 +++
 bestla/bestla/sycl/sycl_wrapper.h            | 216 ++++++
 bestla/bestla/ut/kernel_intrin.cpp           |   4 +
 bestla/bestla/ut/sycl_benchmark.cpp          | 729 +++++++++++++++++++
 bestla/bestla/ut/sycl_gemm.cpp               | 485 ++++++++++++
 bestla/bestla/ut/sycl_misc.cpp               |  34 +
 bestla/bestla/ut/sycl_ut.h                   |  16 +
 bestla/cmake/sycl.cmake                      |   3 +
 neural_speed/core/layers/mha_dense.cpp       |   2 +
 neural_speed/core/layers/mha_dense_wrapper.h |  25 +-
 22 files changed, 2590 insertions(+), 30 deletions(-)
 create mode 100644 bestla/bestla/sycl/sycl_device.h
 create mode 100644 bestla/bestla/sycl/sycl_epilogue.h
 create mode 100644 bestla/bestla/sycl/sycl_gemm.h
 create mode 100644 bestla/bestla/sycl/sycl_prologue_a.h
 create mode 100644 bestla/bestla/sycl/sycl_prologue_b.h
 create mode 100644 bestla/bestla/sycl/sycl_utils.h
 create mode 100644 bestla/bestla/sycl/sycl_wrapper.h
 create mode 100644 bestla/bestla/ut/sycl_benchmark.cpp
 create mode 100644 bestla/bestla/ut/sycl_gemm.cpp
 create mode 100644 bestla/bestla/ut/sycl_misc.cpp
 create mode 100644 bestla/bestla/ut/sycl_ut.h
 create mode 100644 bestla/cmake/sycl.cmake

diff --git a/CMakePresets.json b/CMakePresets.json
index 6ad6836b1..3a0694af9 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -106,6 +106,18 @@
       "cacheVariables": {
         "BTLA_UT_OPENMP": "OFF"
       }
+    },
+    {
+      "name": "x64-release-sycl",
+      "displayName": "x64 Release SYCL",
+      "description": "x64 SYCL",
+      "inherits": "x64-debug",
+      "cacheVariables": {
+        "CMAKE_CXX_COMPILER": "icx-cl",
+        "CMAKE_C_COMPILER": "icx-cl",
+        "CMAKE_BUILD_TYPE": "Release",
+        "BTLA_UT_ALL": "ON"
+      }
     }
   ]
 }
diff --git a/bestla/CMakeLists.txt b/bestla/CMakeLists.txt
index a3082acca..e11ea875c 100644
--- a/bestla/CMakeLists.txt
+++ b/bestla/CMakeLists.txt
@@ -5,6 +5,7 @@ file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
 file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp)
 
 option(BTLA_ENABLE_OPENMP "Compile OpenMP thread pool if OMP can be found" OFF)
+option(BTLA_SYCL "Compile SYCL kernels with a SYCL-enabled compiler" OFF)
 
 option(BTLA_UT_ALL "Enable all unit tests" OFF)
 option(BTLA_UT_DEBUG "Enable debug unit tests" OFF)
@@ -21,6 +22,10 @@ option(BTLA_UT_NOASAN "Disable sanitize" OFF)
 option(BTLA_UT_BENCHMARK "Benchmark ON may take a long time to finish all tests" OFF)
 option(BTLA_UT_OPENMP "Use OpenMP for UT tests" OFF)
 
+
+
+
+
 add_library(${PROJECT_NAME} INTERFACE)
add_library(neural_speed::${PROJECT_NAME} ALIAS ${PROJECT_NAME}) target_include_directories( @@ -28,7 +33,15 @@ target_include_directories( "$" "$" ) - +set(sycl_headers) +set(sycl_libs) +if(BTLA_SYCL) + include(cmake/sycl.cmake) + file(GLOB sycl_headers ${PROJECT_NAME}/sycl/*.h ${PROJECT_NAME}/sycl/*.hpp) + add_compile_definitions(BTLA_SYCL) + list(APPEND sycl_libs IntelSYCL::SYCL_CXX) + #add_link_options(-fsycl-targets=spir64 -Xsycl-target-backend "-options -ze-opt-large-register-file") +endif(BTLA_SYCL) if(BTLA_ENABLE_OPENMP) message(STATUS "BesTLA enable OpenMP ThreadPool") @@ -69,12 +82,20 @@ function(add_ut_flag UT_OPTION) endif() endfunction() +set(benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp) +# list(APPEND benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/sycl_benchmark.cpp) + + if(UT_BUILD) file(GLOB srcs ${PROJECT_NAME}/ut/*.cc ${PROJECT_NAME}/ut/*.cpp) #compile everything even run parts of UTs - list(REMOVE_ITEM srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp) + file(GLOB sycl_srcs ${PROJECT_NAME}/ut/sycl*) + if(NOT BTLA_SYCL) + list(REMOVE_ITEM srcs ${sycl_srcs}) + endif() + list(REMOVE_ITEM srcs ${benchmark_srcs}) file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h) include_directories(${PROJECT_NAME}) - add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${ut_headers}) + add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${sycl_headers} ${ut_headers}) if(BTLA_UT_OPENMP) include(FindOpenMP) target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP) @@ -98,14 +119,16 @@ if(UT_BUILD) add_ut_flag(BTLA_UT_KERNEL_INTRIN) add_ut_flag(BTLA_UT_KERNEL_JIT) add_ut_flag(BTLA_UT_KERNEL_WRAPPER) - target_link_libraries(${PROJECT_NAME}_ut PRIVATE ${PROJECT_NAME}) + if(BTLA_SYCL) + add_compile_definitions(BTLA_UT_SYCL) + endif() + target_link_libraries(${PROJECT_NAME}_ut PRIVATE ${PROJECT_NAME} ${sycl_libs}) endif(UT_BUILD) if(BTLA_UT_BENCHMARK) - file(GLOB srcs ${PROJECT_NAME}/ut/bestla_benchmark.cpp) #compile everything even run parts of UTs file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h) include_directories(${PROJECT_NAME}) - add_executable(${PROJECT_NAME}_benchmark ${srcs} ${headers} ${ut_headers}) + add_executable(${PROJECT_NAME}_benchmark ${benchmark_srcs} ${headers} ${ut_headers}) if(BTLA_UT_OPENMP) include(FindOpenMP) target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP) @@ -114,5 +137,5 @@ if(BTLA_UT_BENCHMARK) if(NOT WIN32) target_link_options(${PROJECT_NAME}_benchmark PRIVATE -lpthread) endif() - target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE ${PROJECT_NAME}) + target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE ${PROJECT_NAME} ${sycl_libs}) endif(BTLA_UT_BENCHMARK) diff --git a/bestla/CMakePresets.json b/bestla/CMakePresets.json index 3fa3071ae..7187120ff 100644 --- a/bestla/CMakePresets.json +++ b/bestla/CMakePresets.json @@ -83,6 +83,34 @@ "description": "Target Windows (64-bit) with the Visual Studio development environment. 
(RelWithDebInfo)", "inherits": "x64-release", "cacheVariables": { "BTLA_UT_ALL": "ON" } + }, + { + "name": "x64-debug-sycl", + "displayName": "x64 Debug SYCL", + "description": "x64 Debug SYCL", + "inherits": "windows-base", + "architecture": { + "value": "x64", + "strategy": "external" + }, + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug", + "BTLA_UT_DEBUG": "ON", + "BTLA_UT_ALL": "OFF", + "BTLA_SYCL": "ON", + "BTLA_UT_BENCHMARK": "ON", + "CMAKE_CXX_COMPILER": "icx", + "CMAKE_C_COMPILER": "icx" + } + }, + { + "name": "x64-release-sycl", + "displayName": "x64 Release for SYCL", + "description": "x64 SYCL", + "inherits": "x64-debug-sycl", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release" + } } ] } diff --git a/bestla/bestla/bestla_utils.h b/bestla/bestla/bestla_utils.h index 284fadb5f..17e24b75e 100644 --- a/bestla/bestla/bestla_utils.h +++ b/bestla/bestla/bestla_utils.h @@ -70,7 +70,7 @@ #define CompileAMXINT8() (CompileAMX()) #endif -#ifdef _MSC_VER +#if defined(_MSC_VER) && !defined(__INTEL_LLVM_COMPILER) #define CompileAVX512F() _MSC_VER && (_MSC_VER >= 1911) #define CompileAVX2() _MSC_VER && (_MSC_VER >= 1900) #define CompileAMX() 0 @@ -80,12 +80,12 @@ #define CompileAMXINT8() 0 #endif -#ifdef __clang_major__ -#define CompileAVX512F() (__clang_major__ >= 4) -#define CompileAVX2() (__clang_major__ >= 3) -#define CompileAMX() (__clang_major__ >= 11) -#define CompileBF16() (__clang_major__ >= 11) -#define CompileFP16() (__clang_major__ >= 16) +#if defined(_MSC_VER) && defined(__INTEL_LLVM_COMPILER) +#define CompileAVX512F() defined(__AVX512F__) +#define CompileAVX2() defined(__AVX2__) && defined(__F16C__) && defined(__FMA__) +#define CompileAMX() 0 +#define CompileBF16() 0 +#define CompileFP16() 0 #define CompileAMXBF16() (CompileAMX()) #define CompileAMXINT8() (CompileAMX()) #endif diff --git a/bestla/bestla/kernel_avx2.h b/bestla/bestla/kernel_avx2.h index a6899d8f5..e980fa90a 100644 --- a/bestla/bestla/kernel_avx2.h +++ b/bestla/bestla/kernel_avx2.h @@ -23,12 +23,12 @@ namespace bestla { namespace kernel { namespace avx2 { #if CompileAVX2() -#ifdef __GNUC__ +#if defined(__GNUC__) #pragma GCC push_options #pragma GCC target("avx2", "fma", "f16c") -#else +#elif defined(ICX) +#pragma clang attribute push(__attribute__((target("avx,avx2,fma"))), apply_to = function) #endif - template static inline __m256i unpack_4bits_avx2(void* srcptr, __m256i mask) { auto raw_data = _mm_loadu_si128(reinterpret_cast<__m128i*>(srcptr)); @@ -74,7 +74,7 @@ inline __m256 ymm_cvt_bf16_fp32(__m128i vbf16) { inline __m128i ymm_cvtepi32_epi16(__m256i src) { __m128i tmp; -#ifdef __GNUC__ +#if defined(__GNUC__) || defined(__clang_major__) for (size_t i = 0; i < 8; i++) { (reinterpret_cast(&tmp))[i] = (reinterpret_cast(&src))[i]; } @@ -443,7 +443,7 @@ inline BTLA_CODE decompress_kblock_f8_fp(utils::f8* srcptr, _DST_T* dstptr, int e_revert = _mm256_srli_epi32(e_revert, mantissabit); if constexpr (WITH_SCALE && std::is_same_v<_S_T, utils::f8>) { auto scale = _mm256_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(sptr + j / _PACK_ROW))); - if constexpr (_PACK_ROW == 2) scale = _mm256_permutexvar_epi32(packrow2_permute_idx, scale); + if constexpr (_PACK_ROW == 2) scale = _mm256_permutevar8x32_epi32(packrow2_permute_idx, scale); e_revert = _mm256_add_epi32(e_revert, scale); } e_revert = _mm256_sub_epi32(e_revert, e_revert_shift); @@ -454,7 +454,7 @@ inline BTLA_CODE decompress_kblock_f8_fp(utils::f8* srcptr, _DST_T* dstptr, int fp_v = _mm256_or_ps(fp_v, _mm256_castsi256_ps(mantissa_revert)); if constexpr 
(WITH_SCALE && std::is_same_v<_S_T, float>) { auto scale = _mm256_loadu_ps(sptr + j / _PACK_ROW); - if constexpr (_PACK_ROW == 2) scale = _mm256_permutexvar_ps(packrow2_permute_idx, scale); + if constexpr (_PACK_ROW == 2) scale = _mm256_permutevar8x32_ps(scale, packrow2_permute_idx); fp_v = _mm256_mul_ps(fp_v, scale); } if constexpr (std::is_same_v<_DST_T, float>) { diff --git a/bestla/bestla/kernel_avx512_bf16.h b/bestla/bestla/kernel_avx512_bf16.h index 453b88afd..ece55a5dd 100644 --- a/bestla/bestla/kernel_avx512_bf16.h +++ b/bestla/bestla/kernel_avx512_bf16.h @@ -47,7 +47,10 @@ static inline BTLA_CODE bf16_cvt_fp32_2D_write_back(const utils::bf16* src_ptr, } return BTLA_CODE::Success; #endif +#if CompileAVX512F() return avx512f::bf16_cvt_fp32_2D_write_back(src_ptr, dst_ptr, row, col, src_step, dst_step, zeropadding); +#endif + return BTLA_CODE::NotSupport; } static inline BTLA_CODE fp32_cvt_bf16_2D_write_back(const void* raw_srcptr, void* raw_dstptr, int row, int col, @@ -83,7 +86,10 @@ static inline BTLA_CODE fp32_cvt_bf16_2D_write_back(const void* raw_srcptr, void } return BTLA_CODE::Success; #endif +#if CompileAVX512F() return avx512f::fp32_cvt_bf16_2D_write_back(raw_srcptr, raw_dstptr, row, col, srcstride, dststride, zeropadding); +#endif + return BTLA_CODE::NotSupport; } #if CompileBF16() #pragma GCC pop_options diff --git a/bestla/bestla/kernel_wrapper.h b/bestla/bestla/kernel_wrapper.h index a9726f28b..f8751b3c9 100644 --- a/bestla/bestla/kernel_wrapper.h +++ b/bestla/bestla/kernel_wrapper.h @@ -34,11 +34,13 @@ class PaddingInterleaveMN { template static BTLA_CODE forward(const T_SRC* src, T_DST* dst, int row, int col, int row_pad, int col_pad, int src_step, int dst_step) { +#if CompileAVX512F() if constexpr (utils::isa_base::avx512f) { const auto kern_ret = kernel::avx512f::padding_interleave_cvt::forward( src, dst, NTile, row, col, row_pad, col_pad, src_step, dst_step); if (kern_ret != BTLA_CODE::NotSupport) return kern_ret; } +#endif return ref::padding_interleave(src, dst, row, col, row_pad, col_pad, src_step, dst_step, NTile, RowPack); } }; @@ -62,12 +64,14 @@ class PaddingTransInterleaveMN { template static BTLA_CODE forward(const T_SRC* src, T_DST* dst, int row, int col, int row_pad, int col_pad, int src_step, int dst_step) { +#if CompileAVX512F() // Note: rows/cols and i/j are in terms of src if constexpr (utils::isa_base::avx512f) { const auto kern_ret = kernel::avx512f::padding_trans_interleave_cvt::forward( src, dst, MTile, row, col, row_pad, col_pad, src_step, dst_step); if (kern_ret != BTLA_CODE::NotSupport) return kern_ret; } +#endif return ref::padding_trans_interleave(src, dst, row, col, row_pad, col_pad, src_step, dst_step, MTile, ColPack); } }; @@ -85,7 +89,6 @@ class Memcpy2D { return ret; } } -#if CompileAVX2() if constexpr (utils::isa_base::avx2) { auto align_col = col * sizeof(_SRC_T) / 32 * 32 / sizeof(_SRC_T); ret = kernel::jit::JitMemcpy2DAvx2::forward<_SRC_T, _DST_T>(srcptr, dstptr, row, align_col, srcstep, dststep, @@ -97,7 +100,6 @@ class Memcpy2D { return ret; } } -#endif return kernel::ref::memcpy2d(srcptr, dstptr, row, col * sizeof(_SRC_T), srcstep * sizeof(_SRC_T), dststep * sizeof(_DST_T)); } @@ -106,7 +108,6 @@ class Memcpy2D { static BTLA_CODE forward1(const _SRC_T* srcptr, _DST_T* dstptr, int row, int col, int srcstep, int dststep, void* const_elt_v = nullptr) { auto ret = BTLA_CODE::NotSupport; -#if CompileAVX512F() if constexpr (utils::isa_base::avx512f) { ret = kernel::jit::JitMemcpy2DAvx512f::forward1<_SRC_T, _DST_T, OP_T>(srcptr, dstptr, 
row, col, srcstep, dststep, const_elt_v); @@ -114,8 +115,6 @@ class Memcpy2D { return ret; } } -#endif -#if CompileAVX2() if constexpr (utils::isa_base::avx2) { auto align_col = col * sizeof(_SRC_T) / 32 * 32 / sizeof(_SRC_T); ret = kernel::jit::JitMemcpy2DAvx2::forward1<_SRC_T, _DST_T, OP_T>(srcptr, dstptr, row, align_col, srcstep, @@ -128,7 +127,6 @@ class Memcpy2D { return ret; } } -#endif return ref::memcpy2d_withop<_SRC_T, _DST_T, OP_T>(srcptr, dstptr, row, col, srcstep, dststep, const_elt_v); } }; @@ -504,10 +502,12 @@ class DecompressKBlockS4S8Fp { reinterpret_cast(tmp), tmpsize); } #endif +#if CompileAVX2() if constexpr (utils::isa_base::avx2) { return avx2::decompress_kblock_s4_s8fp(srcptr, dstptr, row, col, ld_src, ld_dst, reinterpret_cast(tmp), tmpsize); } +#endif return ref::decompress_kblock_s4_s8fp(srcptr, dstptr, row, col, ld_src, ld_dst, reinterpret_cast(tmp), tmpsize); } @@ -605,14 +605,18 @@ class DecompressKBlockF4FpNoscale { static inline BTLA_CODE forward(utils::f4x2* srcptr, _DST_T* dstptr, int row, int col, int ld_src, int ld_dst, void* tmp, size_t tmpsize) { BTLA_CODE ret = BTLA_CODE::NotSupport; +#if CompileAVX512F() if constexpr (utils::isa_base::avx512f) { return avx512f::decompress_kblock_f4_fp_noscale(srcptr, dstptr, row, col, ld_src, ld_dst, reinterpret_cast(tmp), tmpsize); } +#endif +#if CompileAVX2() if constexpr (utils::isa_base::avx2) { return avx2::decompress_kblock_f4_fp_noscale(srcptr, dstptr, row, col, ld_src, ld_dst, reinterpret_cast(tmp), tmpsize); } +#endif return ref::decompress_kblock_f4_fp_noscale(srcptr, dstptr, row, col, ld_src, ld_dst, reinterpret_cast(tmp), tmpsize); } @@ -669,12 +673,10 @@ class DecompressKBlockS8Fp { static inline BTLA_CODE forward(int8_t* srcptr, _DST_T* dstptr, int row, int col, int ld_src, int ld_dst, SCA_T* scales, int8_t* zero_points, int k_offset, int kblock, int NPad, void* tmp, size_t tmpsize) { -#if CompileAVX512F() if constexpr (utils::isa_base::avx512f && std::is_same_v) { // TODO Scale type support return jit::DequanKBlockS8Fp::forward_avx512f(srcptr, dstptr, row, col, ld_src, ld_dst, scales, zero_points, k_offset, kblock, NPad); } -#endif #if CompileAVX2() // PACK_ROW must be 1/4 when using avx2 proB. 
if constexpr (utils::isa_base::avx2 && std::is_same_v && @@ -694,12 +696,16 @@ class DecompressKBlockS8S8Fp { template static inline BTLA_CODE forward(int8_t* srcptr, _DST_T* dstptr, int row, int col, int ld_src, int ld_dst, void* tmp, size_t tmpsize) { +#if CompileAVX512F() if constexpr (utils::isa_base::avx512f) { // TODO Scale type support return avx512f::decompress_kblock_s8_s8fp<_DST_T>(srcptr, dstptr, row, col, ld_src, ld_dst); } +#endif +#if CompileAVX2() if constexpr (utils::isa_base::avx2) { // TODO Scale type support return avx2::decompress_kblock_s8_s8fp<_DST_T>(srcptr, dstptr, row, col, ld_src, ld_dst); } +#endif return ref::decompress_kblock_s8_s8fp<_DST_T>(srcptr, dstptr, row, col, ld_src, ld_dst); } }; @@ -756,9 +762,11 @@ class CompFp32BlockScale { return avx512f::accum_alphaN_f32_f32(alpha, srcptr, srcstep, dstptr, dststep, M, N); } #endif +#if CompileAVX2() if constexpr (utils::isa_base::avx2) { return avx2::accum_alphaN_f32_f32(alpha, srcptr, srcstep, dstptr, dststep, M, N); } +#endif return ref::accum_alphaN_f32_f32(alpha, srcptr, srcstep, dstptr, dststep, M, N); } }; @@ -845,12 +853,16 @@ class ColBlockReduceSum { template static inline BTLA_CODE forward(const SRC_T* srcptr, int ldsrc, int row, int col, int blocksize, float* reduce, int ldr) { +#if CompileAVX512F() if constexpr (utils::isa_base::avx512f && std::is_same_v) { return avx512f::col_block_reduce_sum(srcptr, ldsrc, row, col, blocksize, reduce, ldr); } +#endif +#if CompileAVX2() if constexpr (utils::isa_base::avx2 && std::is_same_v) { return avx2::col_block_reduce_sum(srcptr, ldsrc, row, col, blocksize, reduce, ldr); } +#endif return ref::col_block_reduce_sum(srcptr, ldsrc, row, col, blocksize, reduce, ldr); } }; @@ -911,12 +923,16 @@ class LayerNormalization { template static inline BTLA_CODE forward(const T* srcptr, const T* scaleptr, const T* biasptr, T epsilon, int norm_size, T* dstptr, T* mean, T* mean_square, bool simplified) { +#if CompileAVX512F() if constexpr (utils::isa_base::avx512f && std::is_same_v) { return avx512f::layernorm(srcptr, scaleptr, biasptr, epsilon, norm_size, dstptr, mean, mean_square, simplified); } +#endif +#if CompileAVX2() if constexpr (utils::isa_base::avx2 && std::is_same_v) { return avx2::layernorm(srcptr, scaleptr, biasptr, epsilon, norm_size, dstptr, mean, mean_square, simplified); } +#endif return ref::layernorm(srcptr, scaleptr, biasptr, epsilon, norm_size, dstptr, mean, mean_square, simplified); } template diff --git a/bestla/bestla/sycl/sycl_device.h b/bestla/bestla/sycl/sycl_device.h new file mode 100644 index 000000000..c23d241c1 --- /dev/null +++ b/bestla/bestla/sycl/sycl_device.h @@ -0,0 +1,73 @@ +// Copyright (c) 2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
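Editorially, the kernel_wrapper.h hunks above all apply one pattern: wrap each ISA-specific branch in a Compile*() preprocessor guard so that compilers which cannot emit those instructions (such as icx on Windows, per the bestla_utils.h change) never see the code, while the portable ref:: kernel remains the unconditional fallback. A minimal self-contained sketch of that pattern follows; all names in it are stand-ins for illustration, not real BesTLA APIs.

// Sketch of the guard-plus-fallback dispatch used throughout kernel_wrapper.h.
#define COMPILE_AVX512F 1  // stand-in for CompileAVX512F() from bestla_utils.h

enum class Code { Success, NotSupport };

namespace avx512f { inline Code kernel() { return Code::Success; } }
namespace ref { inline Code kernel() { return Code::Success; } }

template <bool kAvx512f>  // stand-in for utils::isa_base<ISA_T>::avx512f
Code forward() {
#if COMPILE_AVX512F  // the whole branch vanishes on compilers without AVX-512
  if constexpr (kAvx512f) return avx512f::kernel();
#endif
  return ref::kernel();  // portable reference fallback always compiles
}

int main() { return forward<true>() == Code::Success ? 0 : 1; }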
+#pragma once
+#include <sycl/sycl.hpp>
+#include <exception>
+#include <iostream>
+#include <string>
+
+namespace bestla {
+
+namespace sycl_device {
+
+class SyclDevice {
+ public:
+  SyclDevice(bool profile) {
+    // Create an exception handler for asynchronous SYCL exceptions
+    static auto exception_handler = [](sycl::exception_list e_list) {
+      for (std::exception_ptr const& e : e_list) {
+        try {
+          std::rethrow_exception(e);
+        } catch (std::exception const& e) {
+#if _DEBUG
+          std::cout << "Failure" << std::endl;
+#endif
+          std::terminate();
+        }
+      }
+    };
+
+    auto d_selector{sycl::default_selector_v};
+    if (profile) {
+      sycl::property_list prop = {sycl::property::queue::enable_profiling()};
+      mQueue = sycl::queue(d_selector, exception_handler, prop);
+    } else {
+      mQueue = sycl::queue(d_selector, exception_handler);
+    }
+  }
+
+  inline sycl::queue* getQueue() { return &mQueue; }
+
+  inline std::string getName() { return mQueue.get_device().get_info<sycl::info::device::name>(); }
+
+  void print() {
+    std::cout << "Running on device: " << mQueue.get_device().get_info<sycl::info::device::name>() << "\n";
+    std::cout << "EU count:" << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_eu_count>()
+              << "\n";  // 448
+    std::cout << "EU count per subslice:"
+              << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_eu_count_per_subslice>() << "\n";  // 8
+    std::cout << "EU SIMD width:" << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>()
+              << "\n";  // 8
+    std::cout << "HW threads per EU:"
+              << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>() << "\n";  // 8
+    std::cout << "GPU slices:" << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_slices>()
+              << "\n";  // 7
+    std::cout << "Subslice per slice:"
+              << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>() << "\n";  // 8
+  }
+  sycl::queue mQueue;
+};
+
+}  // namespace sycl_device
+}  // namespace bestla
diff --git a/bestla/bestla/sycl/sycl_epilogue.h b/bestla/bestla/sycl/sycl_epilogue.h
new file mode 100644
index 000000000..caa3ef062
--- /dev/null
+++ b/bestla/bestla/sycl/sycl_epilogue.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
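As a quick orientation, here is a minimal usage sketch for the SyclDevice helper defined just above; the include path and the main() wrapper are assumptions for illustration, not part of the patch.

#include "sycl/sycl_device.h"  // assumed include path

int main() {
  // true requests a queue with profiling enabled, which the event-based
  // timing helpers in sycl_utils.h rely on.
  bestla::sycl_device::SyclDevice dev(true);
  dev.print();                      // dump EU/slice topology of the device
  sycl::queue* q = dev.getQueue();  // queue shared by all kernel launches
  return q->get_device().is_gpu() ? 0 : 1;
}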
+#pragma once
+
+#ifdef BTLA_SYCL
+#include <sycl/sycl.hpp>
+
+#include "sycl_utils.h"
+
+namespace bestla {
+namespace sycl_epilogue {
+template <typename DstT>
+struct ParamOutputBase {
+  DstT* C;
+  int ldc;
+};
+template <class GemmCoreT, typename DstT>
+class OutputBase {
+ public:
+  using CType = typename GemmCoreT::TACC;
+  using DstType = DstT;
+  using Param = ParamOutputBase<DstType>;
+  static inline void store(const Param& _param, CType* tmpAcc, const sycl_utils::nd_item_helper<GemmCoreT>& helper) {
+#pragma unroll
+    for (int im = 0; im < GemmCoreT::TileM; im++) {
+#pragma unroll
+      for (int in = 0; in < GemmCoreT::TileN; in++) {
+        _param.C[(helper.item_g_m() + im) * _param.ldc + helper.item_g_n() + in] = tmpAcc[im * GemmCoreT::TileN + in];
+      }
+    }
+  }
+
+  static inline void store_tail(const Param& _param, CType* tmpAcc, const sycl_utils::nd_item_helper<GemmCoreT>& helper,
+                                int m_tail) {
+    if (m_tail) {
+      for (int im = 0; im < m_tail; im++) {
+#pragma unroll
+        for (int in = 0; in < GemmCoreT::TileN; in++) {
+          _param.C[(helper.item_g_m() + im) * _param.ldc + helper.item_g_n() + in] = tmpAcc[im * GemmCoreT::TileN + in];
+        }
+      }
+    }
+  }
+};
+
+}  // namespace sycl_epilogue
+}  // namespace bestla
+#endif
diff --git a/bestla/bestla/sycl/sycl_gemm.h b/bestla/bestla/sycl/sycl_gemm.h
new file mode 100644
index 000000000..7ba1e7963
--- /dev/null
+++ b/bestla/bestla/sycl/sycl_gemm.h
@@ -0,0 +1,227 @@
+// Copyright (c) 2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
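The sycl_gemm.h body that follows defines per-datatype tiling configs and the work-group constants derived from them. As orientation for those definitions below, these illustrative static checks (mine, not the patch's; include path assumed) spell out how the Config_Fp32Fp32Fp32 numbers compose.

#include "sycl/sycl_gemm.h"  // assumed include path

using Core = bestla::sycl_gemm::xve::DefaultSGemmCore;
// A work-group of WgM x WgN lanes computes a WgMEle x WgNEle block of C.
static_assert(Core::WgMEle == 8 * 16, "wg_m * sg_m rows of C per work-group");
static_assert(Core::WgNEle == 32 * 2, "wg_n * sg_n columns of C per work-group");
// Shared local memory holds one TileK-deep slice of B for the whole group.
static_assert(Core::SLM_B_Size == Core::WgNEle * Core::TileK, "B slice in SLM");
static_assert(Core::SLM_A_Size == 0, "A is streamed through registers, not SLM");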
+#pragma once + +#ifdef BTLA_SYCL +#include + +#include "bestla_utils.h" +#include + +namespace bestla { +namespace sycl_gemm { +namespace xve { +class Config_Fp32Fp32Fp32 { + public: + static int constexpr sg_size = 16; + static int constexpr sg_m = 16; + static int constexpr sg_n = 2; + static int constexpr sg_k = 32; + static int constexpr unroll_k = 4; + static int constexpr wg_m = 8; + static int constexpr wg_n = 32; + + using data_type_a = float; + using data_type_b = float; + using data_type_c = float; + using data_type_acc = float; +}; + +template +class SGemmCoreSharedB { + public: + static int constexpr SgSize = ConfigT::sg_size; + static int constexpr WgM = ConfigT::wg_m; + static int constexpr WgN = ConfigT::wg_n; + static int constexpr SgNStride = WgN / SgSize; + static int constexpr WgWorkers = WgM * WgN; + static int constexpr SgCount = WgWorkers / SgSize; + static int constexpr TileM = ConfigT::sg_m; + static int constexpr TileN = ConfigT::sg_n; + static int constexpr TileK = ConfigT::sg_k; + static int constexpr UnrollK = ConfigT::unroll_k; + static int constexpr WgNEle = WgN * TileN; + static int constexpr WgMEle = WgM * TileM; + static int constexpr SgNEle = SgSize * TileN; + static int constexpr SLM_B_Size = WgNEle * TileK; + static int constexpr SLM_A_Size = 0; + + using TA = typename ConfigT::data_type_a; + using TB = typename ConfigT::data_type_b; + using TC = typename ConfigT::data_type_c; + using TACC = typename ConfigT::data_type_acc; + + using SLM_B_Acc = sycl::local_accessor; + + static inline void compute(const TA* aptr, int lda, const SLM_B_Acc& bacc, TACC* accptr, + const sycl_utils::nd_item_helper>& helper) { +#pragma unroll(1) + for (int ik = 0; ik < TileK; ik += UnrollK) { + int constexpr MReg = TileM / SgSize; + TA regA[UnrollK * MReg]; + for (int im = 0; im < MReg; im++) { + *(sycl::vec*)®A[im * UnrollK] = + *(sycl::vec*)&aptr[(helper.sg_id() + im * SgSize) * lda + ik]; + } + +#pragma unroll + for (int ikk = 0; ikk < UnrollK; ikk++) { + TB tmpB[TileN]; +#pragma unroll + for (int in = 0; in < TileN; in++) { + tmpB[in] = bacc[helper.sg_idx_n() * SgNEle + helper.sg_id() * TileN + in + (ik + ikk) * WgNEle]; + } +#pragma unroll + for (size_t im = 0; im < TileM; im++) { + auto tmpA = helper.sg.shuffle(regA[ikk + im / SgSize * UnrollK], im % SgSize); +#pragma unroll + for (size_t in = 0; in < TileN; in++) { + accptr[im * TileN + in] += tmpA * tmpB[in]; + } + } + } + } + } + + static inline void compute_mtail(const TA* aptr, int lda, const SLM_B_Acc& bacc, TACC* accptr, + const sycl_utils::nd_item_helper>& helper, int& m_tail) { + if (m_tail > 0) { +#pragma unroll(1) + for (int ik = 0; ik < TileK; ik += UnrollK) { + for (int ikk = 0; ikk < UnrollK; ikk++) { + TB tmpB[TileN]; +#pragma unroll + for (int in = 0; in < TileN; in++) { + tmpB[in] = bacc[helper.sg_idx_n() * SgNEle + helper.sg_id() * TileN + in + (ik + ikk) * WgNEle]; + } + for (size_t im = 0; im < m_tail; im++) { + auto tmpA = aptr[im * lda + ik + ikk]; +#pragma unroll + for (size_t in = 0; in < TileN; in++) { + accptr[im * TileN + in] += tmpA * tmpB[in]; + } + } + } + } + } + } +}; + +using DefaultSGemmCore = SGemmCoreSharedB; + +class Config_Fp16Fp16Fp16 { + public: + static int constexpr sg_size = 16; + static int constexpr sg_m = 16; + static int constexpr sg_n = 4; + static int constexpr sg_k = 32; + static int constexpr unroll_k = 4; + static int constexpr wg_m = 16; + static int constexpr wg_n = 32; + + using data_type_a = sycl::half; + using data_type_b = sycl::half; + using data_type_c = 
sycl::half; + using data_type_acc = sycl::half; +}; + +template +class HGemmCoreSharedB { + public: + static int constexpr SgSize = ConfigT::sg_size; + static int constexpr WgM = ConfigT::wg_m; + static int constexpr WgN = ConfigT::wg_n; + static int constexpr SgNStride = WgN / SgSize; + static int constexpr WgWorkers = WgM * WgN; + static int constexpr SgCount = WgWorkers / SgSize; + static int constexpr TileM = ConfigT::sg_m; + static int constexpr TileN = ConfigT::sg_n; + static int constexpr TileK = ConfigT::sg_k; + static int constexpr UnrollK = ConfigT::unroll_k; + static int constexpr WgNEle = WgN * TileN; + static int constexpr WgMEle = WgM * TileM; + static int constexpr SgNEle = SgSize * TileN; + static int constexpr SLM_B_Size = WgNEle * TileK; + static int constexpr SLM_A_Size = 0; + + using TA = typename ConfigT::data_type_a; + using TB = typename ConfigT::data_type_b; + using TC = typename ConfigT::data_type_c; + using TACC = typename ConfigT::data_type_acc; + + using SLM_B_Acc = sycl::local_accessor; + + static inline void compute(const TA* aptr, int lda, const SLM_B_Acc& bacc, TACC* accptr, + const sycl_utils::nd_item_helper>& helper) { +#pragma unroll(1) + for (int ik = 0; ik < TileK; ik += UnrollK) { + static_assert((UnrollK * sizeof(TA)) % sizeof(float) == 0); + int constexpr MReg = TileM / SgSize; + static_assert(MReg == 1); + TA regA[UnrollK * MReg]; + for (int im = 0; im < MReg; im++) { + *(sycl::vec*)®A[im * UnrollK] = + *(sycl::vec*)&aptr[(helper.sg_id() + im * SgSize) * lda + ik]; + } +#pragma unroll + for (int ikk = 0; ikk < UnrollK; ikk++) { + TB tmpB[TileN]; +#pragma unroll + for (int in = 0; in < TileN; in++) { + tmpB[in] = bacc[helper.sg_idx_n() * SgNEle + helper.sg_id() * TileN + in + (ik + ikk) * WgNEle]; + } +#pragma unroll + for (size_t im = 0; im < TileM; im++) { + auto tmpA = helper.sg.shuffle(regA[ikk + im / SgSize * UnrollK], im % SgSize); +#pragma unroll + for (size_t in = 0; in < TileN; in++) { + accptr[im * TileN + in] += tmpA * tmpB[in]; + } + } + } + } + } + + static inline void compute_mtail(const TA* aptr, int lda, const SLM_B_Acc& bacc, TACC* accptr, + const sycl_utils::nd_item_helper>& helper, + const int& m_tail) { + if (m_tail > 0) { +#pragma unroll(1) + for (int ik = 0; ik < TileK; ik += UnrollK) { +#pragma unroll + for (int ikk = 0; ikk < UnrollK; ikk++) { + TB tmpB[TileN]; +#pragma unroll + for (int in = 0; in < TileN; in++) { + tmpB[in] = bacc[helper.sg_idx_n() * SgNEle + helper.sg_id() * TileN + in + (ik + ikk) * WgNEle]; + } + for (size_t im = 0; im < m_tail; im++) { + auto tmpA = aptr[im * lda + ik + ikk]; +#pragma unroll + for (size_t in = 0; in < TileN; in++) { + accptr[im * TileN + in] += tmpA * tmpB[in]; + } + } + } + } + } + } +}; + +using DefaultHGemmCore = HGemmCoreSharedB; +} // namespace xve + +} // namespace sycl_gemm +} // namespace bestla +#endif diff --git a/bestla/bestla/sycl/sycl_prologue_a.h b/bestla/bestla/sycl/sycl_prologue_a.h new file mode 100644 index 000000000..28350f276 --- /dev/null +++ b/bestla/bestla/sycl/sycl_prologue_a.h @@ -0,0 +1,41 @@ +// Copyright (c) 2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#ifdef BTLA_SYCL +#include + +#include "bestla_utils.h" +#include + +namespace bestla { +namespace sycl_prologue_a { + +template +struct ParamActivationBase { + const SrcT* A; + int lda; +}; +template +class ActivationBase { + public: + using AType = typename GemmCoreT::TA; + using SrcType = SrcT; + using Param = ParamActivationBase; + static inline void getActivation(const Param& _param, AType* aptr, sycl_utils::nd_item_helper& helper) {} +}; + +} // namespace sycl_prologue_a +} // namespace bestla +#endif diff --git a/bestla/bestla/sycl/sycl_prologue_b.h b/bestla/bestla/sycl/sycl_prologue_b.h new file mode 100644 index 000000000..089a81dd5 --- /dev/null +++ b/bestla/bestla/sycl/sycl_prologue_b.h @@ -0,0 +1,455 @@ +// Copyright (c) 2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#ifdef BTLA_SYCL +#include + +#include "bestla_utils.h" +#include + +namespace bestla { +namespace sycl_prologue_b { + +template +struct ParamWeightBase { + const SrcT* B; + int ldb; +}; +template +class WeightBase { + public: + using BType = typename GemmCoreT::TB; + using SRCType = SrcT; + using Param = ParamWeightBase; + + static inline void getWeight(const Param& _param, const sycl::local_accessor& dstptr, int koffset, + const sycl_utils::nd_item_helper& helper) { + int constexpr Iter_PerWorker = (GemmCoreT::TileK + GemmCoreT::WgM - 1) / GemmCoreT::WgM; +#pragma unroll + for (int icp = 0; icp < Iter_PerWorker; icp++) { + { + for (size_t in = 0; in < GemmCoreT::TileN; in++) { + dstptr[(helper.sg_idx_m() + icp * GemmCoreT::WgM) * GemmCoreT::WgNEle + + (helper.sg_idx_n() * GemmCoreT::SgSize + helper.sg_id()) * GemmCoreT::TileN + in] = + _param.B[helper.item_g_n() + in + (koffset + helper.sg_idx_m() + icp * GemmCoreT::WgM) * _param.ldb]; + } + } + } + } +}; + +class KernelConfigBase { + public: + static int constexpr SgSize = 16; + static int constexpr TileK = 16; + static int constexpr TileN = 2; +}; + +template +struct ParamWeightS4 { + const uint8_t* B; + const ScaleT* scale; + int ldb; +}; + +template +class WeightS4 { + public: + using BType = typename GemmCoreT::TB; + using Param = ParamWeightS4; + + static inline void getWeight(const Param& _param, const sycl::local_accessor& dstptr, int koffset, + int blocksize, const sycl_utils::nd_item_helper& helper) { + int constexpr Iter_PerWorker = (GemmCoreT::TileK + GemmCoreT::WgM - 1) / GemmCoreT::WgM; + ScaleT scale[GemmCoreT::TileN]; + for (size_t in = 0; in < GemmCoreT::TileN; in += 1) + scale[in] = _param.scale[helper.item_g_n() + in + koffset / blocksize * _param.ldb]; +#pragma 
unroll + for (int icp = 0; icp < Iter_PerWorker; icp++) { + { + for (size_t in = 0; in < GemmCoreT::TileN; in += 2) { + auto tmps8 = + _param + .B[(helper.item_g_n() + in + (koffset + helper.sg_idx_m() + icp * GemmCoreT::WgM) * _param.ldb) / 2]; + dstptr[(helper.sg_idx_m() + icp * GemmCoreT::WgM) * GemmCoreT::WgNEle + + (helper.sg_idx_n() * GemmCoreT::SgSize + helper.sg_id()) * GemmCoreT::TileN + in] = + static_cast((tmps8 & 0x0f) << 4) * scale[in]; + dstptr[(helper.sg_idx_m() + icp * GemmCoreT::WgM) * GemmCoreT::WgNEle + + (helper.sg_idx_n() * GemmCoreT::SgSize + helper.sg_id()) * GemmCoreT::TileN + in + 1] = + static_cast((tmps8 & 0xf0)) * scale[in + 1]; + } + } + } + } + + template + static inline sycl::event dequant_s4(int n, int k, int blocksize, const Param& in, BType* outptr, sycl::queue* q) { + int constexpr SgSize = KernelConfigBase::SgSize; + int constexpr TileK = KernelConfigBase::TileK; + int constexpr TileN = KernelConfigBase::TileN; + int constexpr GroupN = SgSize * TileN; + int constexpr GroupK = TileK; + static_assert(TileN % 2 == 0); + assert(blocksize % TileK == 0); + + int nsg_k = k / GroupK; + int nsg_n = n / GroupN; + sycl::range<1> group{SgSize}; + sycl::range<1> problem{nsg_n * nsg_k * SgSize}; + auto B_d = in.B; + auto S_d = in.scale; + int ldb = in.ldb; + auto deq_kernel = [&](sycl::handler& cgh) { + cgh.parallel_for(sycl::nd_range<1>(problem, group), + [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(SgSize)]] { + int g_idx = it.get_group(0); + auto sg = it.get_sub_group(); + int sg_id = sg.get_local_id()[0]; + int g_idx_n = g_idx % nsg_n; + int g_idx_k = g_idx / nsg_n; + int g_n = g_idx_n * GroupN; + int g_k = g_idx_k * GroupK; + auto sptr = S_d + g_k / blocksize * ldb + g_n; + auto bptr = B_d + (g_k * ldb + g_n) / 2; + auto dbptr = outptr + g_k * n + g_n; + float tmp[TileK * TileN]; + float scale[TileN]; + for (int in = 0; in < TileN; in += 1) { + scale[in] = sptr[sg_id * TileN + in]; + } + for (int ik = 0; ik < TileK; ik += 1) { + for (int in = 0; in < TileN; in += 2) { + uint8_t srcu8 = *(bptr + (ik * ldb + sg_id * TileN + in) / 2); + tmp[ik * TileN + in] = static_cast((srcu8 & 0x0f) << 4) * scale[in]; + tmp[ik * TileN + in + 1] = static_cast((srcu8 & 0xf0)) * scale[in + 1]; + } + } + for (int ik = 0; ik < TileK; ik += 1) { + for (int in = 0; in < TileN; in += 1) { + dbptr[ik * n + sg_id * TileN + in] = tmp[ik * TileN + in]; + } + } + }); + }; + return q->submit(deq_kernel); + } +}; + +class KernelConfigTrans { + public: + static int constexpr SgSize = 16; + static int constexpr TileK = 32; + static int constexpr TileN = 1; +}; + +template +class WeightS4Trans { + public: + using AType = typename GemmCoreT::TA; + using BType = typename GemmCoreT::TB; + using CType = typename GemmCoreT::TC; + using Param = ParamWeightS4; + + static inline void getWeight(const Param& _param, const sycl::local_accessor& dstptr, int koffset, + int blocksize, const sycl_utils::nd_item_helper& helper) { + int constexpr LoadTileK = 2; + static_assert(GemmCoreT::TileK == (LoadTileK * GemmCoreT::SgSize)); + int constexpr Iter_PerWorker = GemmCoreT::WgNEle / GemmCoreT::SgCount; + auto wldb = _param.ldb * blocksize; + int sgn = helper.wg_g_n() + helper.sg_group_id(); + int sg_off = helper.sg_id() * LoadTileK * GemmCoreT::WgNEle; +#pragma unroll + for (int icp = 0; icp < Iter_PerWorker; icp++) { + { + auto scale = _param.scale[(sgn + icp * GemmCoreT::SgCount) * _param.ldb + koffset / blocksize]; + auto tmps8 = _param.B[((sgn + icp * GemmCoreT::SgCount) * wldb + (koffset + 
helper.sg_id() * LoadTileK)) / 2]; + if constexpr (std::is_same_v) { + sycl::half2 tmpBf = {static_cast((tmps8 & 0x0f) << 4), static_cast((tmps8 & 0xf0))}; + tmpBf *= scale; + dstptr[sg_off + helper.sg_group_id() + icp * GemmCoreT::SgCount] = tmpBf[0]; + dstptr[sg_off + GemmCoreT::WgNEle + helper.sg_group_id() + icp * GemmCoreT::SgCount] = tmpBf[1]; + } else { + dstptr[sg_off + helper.sg_group_id() + icp * GemmCoreT::SgCount] = + static_cast((tmps8 & 0x0f) << 4) * scale; + dstptr[sg_off + GemmCoreT::WgNEle + helper.sg_group_id() + icp * GemmCoreT::SgCount] = + static_cast((tmps8 & 0xf0)) * scale; + } + } + } + } + + template + static inline sycl::event dequant_s4(int n, int k, int blocksize, const Param& in, BType* outptr, sycl::queue* q) { + int constexpr SgSize = KernelConfigBase::SgSize; + int constexpr TileK = KernelConfigBase::TileK; + int constexpr TileN = KernelConfigBase::TileN; + int constexpr GroupN = TileN; + int constexpr SubGroupK = SgSize * TileK; + int constexpr GroupK = SgSize * TileK; + static_assert(TileN == 1); + assert(blocksize % TileK == 0); + + int nsg_k = k / GroupK; + int nsg_n = n / GroupN; + sycl::range<1> group{SgSize}; + sycl::range<1> problem{nsg_n * nsg_k * SgSize}; + auto B_d = in.B; + auto S_d = in.scale; + int ldb = in.ldb; + int ldbn = in.ldb * blocksize; + auto deq_kernel = [&](sycl::handler& cgh) { + cgh.parallel_for( + sycl::nd_range<1>(problem, group), [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(SgSize)]] { + int g_idx = it.get_group(0); + auto sg = it.get_sub_group(); + int sg_id = sg.get_local_id()[0]; + int sg_group_id = sg.get_group_id()[0]; + int g_idx_n = g_idx / nsg_k; + int g_idx_k = g_idx % nsg_k; + int g_n = g_idx_n * GroupN; + int g_k = g_idx_k * GroupK; + int sg_k = g_k + sg_group_id * SubGroupK; + auto sptr = S_d + sg_k / blocksize + g_n * ldb; + auto bptr = B_d + (sg_k + g_n * ldbn) / 2; + auto dbptr = outptr + sg_k + g_n * k; + float tmp[TileK]; + int constexpr Unroll = 4; +#pragma unroll + for (int ik = 0; ik < TileK; ik += Unroll) { + float dst[Unroll]; + float scale = sptr[(ik * SgSize + sg_id * Unroll) / blocksize]; + for (int ir = 0; ir < Unroll; ir += 2) { + uint8_t srcu8 = *(bptr + (ik * SgSize + sg_id * Unroll + ir) / 2); + dst[ir] = static_cast((srcu8 & 0x0f) << 4) * scale; + dst[ir + 1] = static_cast((srcu8 & 0xf0)) * scale; + } + *(sycl::vec*)&dbptr[ik * SgSize + sg_id * Unroll] = *(sycl::vec*)dst; + } + }); + }; + return q->submit(deq_kernel); + } + +#if 0 + template + static inline sycl::event dequant_s4_trans(int n, int k, int blocksize, const Param& in, BType* outptr, + sycl::queue* q) { + int constexpr SgSize = 16; + int constexpr TileK = 2; + int constexpr TileN = 16; + int constexpr GroupN = TileN; + int constexpr GroupK = SgSize * TileK; + assert(blocksize % TileK == 0); + static_assert(TileN == SgSize); + int nsg_k = k / GroupK; + int nsg_n = n / GroupN; + sycl::range<1> group{SgSize}; + sycl::range<1> problem{nsg_n * nsg_k * SgSize}; + auto B_d = in.B; + auto S_d = in.scale; + int ldb = in.ldb; + int ldbn = in.ldb * blocksize; + auto deq_kernel = [&](sycl::handler& cgh) { + cgh.parallel_for(sycl::nd_range<1>(problem, group), + [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(SgSize)]] { + int g_idx = it.get_group(0); + auto sg = it.get_sub_group(); + int sg_id = sg.get_local_id()[0]; + int g_idx_n = g_idx / nsg_k; + int g_idx_k = g_idx % nsg_k; + int g_n = g_idx_n * GroupN; + int g_k = g_idx_k * GroupK; + auto sptr = S_d + g_k / blocksize + g_n * ldb; + auto bptr = B_d + (g_k + g_n * ldbn) / 2; + auto 
dbptr = outptr + g_k * n + g_n; + float tmp[TileN * TileK]; + for (int in = 0; in < TileN; in++) { + float scale = sptr[sg_id * TileK / blocksize + in * ldb]; + for (int ik = 0; ik < TileK; ik += 2) { + uint8_t srcu8 = *(bptr + (sg_id * TileK + ik + in * ldbn) / 2); + tmp[in * TileK + ik] = static_cast((srcu8 & 0x0f) << 4) * scale; + tmp[in * TileK + ik + 1] = static_cast((srcu8 & 0xf0)) * scale; + } + } + + float tmpT[TileN * TileK]; + for (int ik = 0; ik < TileK; ik++) { + for (int in = 0; in < TileN; in++) { + for (int is = 0; is < SgSize; is++) { + auto shlv = sg.shuffle(tmp[in * TileK + ik], is); + if (sg_id == in) { + tmpT[ik * TileN + is] = shlv; + } + } + } + } + for (int in = 0; in < TileN; in++) { + for (int ik = 0; ik < TileK; ik++) { + dbptr[sg_id + (in * TileK + ik) * n] = tmpT[ik * TileN + in]; + } + } + }); + }; + return q->submit(deq_kernel); + } +#else + template + static inline sycl::event dequant_s4_trans(int n, int k, int blocksize, const Param& in, BType* outptr, + sycl::queue* q) { + int constexpr SgSize = 16; + int constexpr TileK = 1; + int constexpr TileN = 16; + int constexpr GroupN = TileN; + int constexpr GroupK = SgSize * TileK; + assert(blocksize % TileK == 0); + static_assert(TileN == SgSize); + static_assert(TileK == 1); + int nsg_k = k / GroupK; + int nsg_n = n / GroupN; + sycl::range<1> group{SgSize}; + sycl::range<1> problem{nsg_n * nsg_k * SgSize}; + auto B_d = in.B; + auto S_d = in.scale; + int ldb = in.ldb; + int ldbn = in.ldb * blocksize; + auto deq_kernel = [&](sycl::handler& cgh) { + cgh.parallel_for(sycl::nd_range<1>(problem, group), + [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(SgSize)]] { + int g_idx = it.get_group(0); + auto sg = it.get_sub_group(); + int sg_id = sg.get_local_id()[0]; + int g_idx_n = g_idx / nsg_k; + int g_idx_k = g_idx % nsg_k; + int g_n = g_idx_n * GroupN; + int g_k = g_idx_k * GroupK; + auto sptr = S_d + g_k / blocksize + g_n * ldb; + auto bptr = B_d + (g_k + g_n * ldbn) / 2; + auto dbptr = outptr + g_k * n + g_n; + float tmp[TileN]; + bool high4 = sg_id % 2 != 0; + for (int in = 0; in < TileN; in++) { + float scale = sptr[sg_id * TileK / blocksize + in * ldb]; + uint8_t srcu8 = *(bptr + (sg_id * TileK + in * ldbn) / 2); + tmp[in] = high4 ? 
static_cast((srcu8 & 0xf0)) * scale + : static_cast((srcu8 & 0x0f) << 4) * scale; + } + + float tmpT[TileN]; + for (int in = 0; in < TileN; in++) { + for (int is = 0; is < SgSize; is++) { + auto shlv = sg.shuffle(tmp[in], is); + if (sg_id == in) { + tmpT[is] = shlv; + } + } + } + for (int in = 0; in < TileN; in++) { + dbptr[sg_id + in * n] = tmpT[in]; + } + }); + }; + return q->submit(deq_kernel); + } +#endif + + static inline sycl::event gemv(const AType* A, const Param& paramB, CType* C, int n, int k, int blocksize, + sycl::queue* q) { + auto B = paramB.B; + auto B_scale = paramB.scale; + int ldb = paramB.ldb; + int constexpr SgSize = 16; + int constexpr TileK = 32; + int constexpr GroupK = SgSize * TileK; + sycl::range<1> group{SgSize}; + sycl::range<1> problem{n * SgSize}; + + auto ev = q->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl::nd_range<1>(problem, group), + [=](sycl::nd_item<1> it) [[cl::reqd_work_group_size( + 1, 1, SgSize)]] [[intel::kernel_args_restrict]] [[intel::reqd_sub_group_size(SgSize)]] { + int g_idx = it.get_group(0); + auto sg = it.get_sub_group(); + int sg_id = sg.get_local_id()[0]; + int g_n = g_idx; + auto sptr = B_scale + g_n * ldb; + auto bptr = B + g_n * k / 2; + auto aptr = A; + auto cptr = C + g_n; + if constexpr (std::is_same_v) { + sycl::half2 tmpAcc = {0.f, 0.f}; + int constexpr Unroll = 2; + for (int i = 0; i < k; i += GroupK * Unroll) { +#pragma unroll + for (int iu = 0; iu < Unroll; iu++) { + uint8_t tmps8[TileK / 2]; + *(sycl::vec*)tmps8 = *(sycl::vec*)(bptr + sg_id * TileK / 2); + CType scale = *(sptr + sg_id * TileK / blocksize); +#pragma unroll + for (int ikk = 0; ikk < TileK; ikk += 2) { + sycl::half2 tmpA = *(sycl::half2*)&aptr[sg_id * TileK + ikk]; + sycl::half2 tmpB = {static_cast((tmps8[ikk / 2] & 0x0f) << 4), + static_cast((tmps8[ikk / 2] & 0xf0))}; + tmpAcc += tmpA * tmpB * scale; + } + sptr += GroupK / blocksize; + aptr += GroupK; + bptr += GroupK / 2; + } + } + sycl::half2 sum = {0.f, 0.f}; + for (int i = 0; i < SgSize; i += 1) { + sum += sg.shuffle(tmpAcc, i); + } + if (sg_id == 0) { + *cptr = sum[0] + sum[1]; + } + } else { + CType tmpAcc = 0.f; + int constexpr Unroll = 2; + for (int i = 0; i < k; i += GroupK * Unroll) { +#pragma unroll + for (int iu = 0; iu < Unroll; iu++) { + uint8_t tmps8[TileK / 2]; + *(sycl::vec*)tmps8 = *(sycl::vec*)(bptr + sg_id * TileK / 2); + CType scale = *(sptr + sg_id * TileK / blocksize); +#pragma unroll + for (int ikk = 0; ikk < TileK; ikk += 2) { + tmpAcc += + CType(aptr[sg_id * TileK + ikk]) * static_cast((tmps8[ikk / 2] & 0x0f) << 4) * scale; + tmpAcc += + CType(aptr[sg_id * TileK + ikk + 1]) * static_cast((tmps8[ikk / 2] & 0xf0)) * scale; + } + sptr += GroupK / blocksize; + aptr += GroupK; + bptr += GroupK / 2; + } + } + float sum = 0.f; + for (int i = 0; i < SgSize; i += 1) { + sum += sg.shuffle(tmpAcc, i); + } + if (sg_id == 0) { + *cptr = sum; + } + } + }); + }); + return ev; + } +}; +} // namespace sycl_prologue_b +} // namespace bestla +#endif diff --git a/bestla/bestla/sycl/sycl_utils.h b/bestla/bestla/sycl/sycl_utils.h new file mode 100644 index 000000000..2cdf01626 --- /dev/null +++ b/bestla/bestla/sycl/sycl_utils.h @@ -0,0 +1,110 @@ +// Copyright (c) 2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "sycl_device.h"
+#include "bestla_utils.h"
+
+namespace bestla {
+namespace sycl_utils {
+
+struct sycl_deleter {
+  sycl::queue* queue_;
+  sycl_deleter(sycl::queue* _q) : queue_(_q) {}
+  template <typename T>
+  void operator()(T* obj) const {
+    if (obj) {
+      sycl::free(obj, *queue_);
+    }
+  }
+};
+
+template <typename _T>
+struct sycl_vector {
+  sycl_vector(uint64_t _size = 0, sycl::queue* _q = nullptr) : size_(_size) {
+    if (_q && _size) {
+      resize(_size, _q);
+    }
+  }
+
+  void resize(uint64_t _size, sycl::queue* _q) {
+    size_ = _size;
+    _T* tmp = sycl::malloc_device<_T>(_size, *_q);
+    ptr_ = std::shared_ptr<_T>(tmp, sycl_deleter(_q));
+  }
+
+  inline uint64_t size() { return size_; }
+
+  inline _T* data() { return ptr_.get(); }
+
+  std::shared_ptr<_T> ptr_;
+  uint64_t size_;
+};
+
+template <typename T>
+__inline__ std::vector<T> sycl2host(const T* syclptr, size_t elecount, sycl::queue* q) {
+  std::vector<T> tmp(elecount);
+  q->memcpy(tmp.data(), syclptr, elecount * sizeof(T)).wait();
+  return tmp;
+}
+
+class event_helper {
+ public:
+  static float elapsed_time(sycl::event& evt) {
+    float t = 0.f;
+    const auto startKernExecutionTimePoint = evt.get_profiling_info<sycl::info::event_profiling::command_submit>();
+    const auto endKernExecutionTimePoint = evt.get_profiling_info<sycl::info::event_profiling::command_end>();
+    t = (endKernExecutionTimePoint - startKernExecutionTimePoint) / 1e6;
+    return t;
+  }
+
+  static float execute_time(sycl::event& evt) {
+    float t = 0.f;
+    const auto startKernExecutionTimePoint = evt.get_profiling_info<sycl::info::event_profiling::command_start>();
+    const auto endKernExecutionTimePoint = evt.get_profiling_info<sycl::info::event_profiling::command_end>();
+    t = (endKernExecutionTimePoint - startKernExecutionTimePoint) / 1e6;
+    return t;
+  }
+};
+template <class GemmCoreT>
+class nd_item_helper {
+ public:
+  const sycl::nd_item<2> it;
+  const sycl::sub_group sg;
+  nd_item_helper(sycl::nd_item<2>& _it) : it(_it), sg(it.get_sub_group()) {}
+
+  constexpr inline void local_barrier() const { it.barrier(sycl::access::fence_space::local_space); }
+
+  constexpr inline int sg_group_id() const { return sg.get_group_id()[0]; }
+
+  constexpr inline int wg_idx_m() const { return it.get_group(0); }
+  constexpr inline int wg_size_m() const { return GemmCoreT::WgM * GemmCoreT::TileM; }
+  constexpr inline int wg_g_m() const { return wg_idx_m() * wg_size_m(); }
+
+  constexpr inline int wg_idx_n() const { return it.get_group(1); }
+  constexpr inline int wg_size_n() const { return GemmCoreT::WgN * GemmCoreT::TileN; }
+  constexpr inline int wg_g_n() const { return wg_idx_n() * wg_size_n(); }
+
+  constexpr inline int sg_idx_m() const { return sg_group_id() / GemmCoreT::SgNStride; }
+  constexpr inline int sg_g_m() const { return wg_g_m() + sg_idx_m() * GemmCoreT::TileM; }
+
+  constexpr inline int sg_idx_n() const { return sg_group_id() % GemmCoreT::SgNStride; }
+  constexpr inline int sg_g_n() const { return wg_g_n() + sg_idx_n() * GemmCoreT::SgSize * GemmCoreT::TileN; }
+
+  constexpr inline int sg_id() const { return sg.get_local_id()[0]; }
+  constexpr inline int item_g_m() const { return sg_g_m(); }
+  constexpr inline int item_g_n() const { return sg_g_n() + sg_id() * GemmCoreT::TileN; }
+};
+
+}  // namespace sycl_utils
+}  // namespace bestla
diff --git
a/bestla/bestla/sycl/sycl_wrapper.h b/bestla/bestla/sycl/sycl_wrapper.h new file mode 100644 index 000000000..29dd84997 --- /dev/null +++ b/bestla/bestla/sycl/sycl_wrapper.h @@ -0,0 +1,216 @@ +// Copyright (c) 2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#ifdef BTLA_SYCL +#include + +#include "bestla_utils.h" +#include "sycl_utils.h" +#include "sycl_device.h" +#include "sycl_gemm.h" +#include "sycl_epilogue.h" +#include "sycl_prologue_a.h" +#include "sycl_prologue_b.h" + +namespace bestla { +namespace sycl_wrapper { +template
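The patch text ends here, mid-way through sycl_wrapper.h. To close, a hedged end-to-end sketch of how the pieces above compose: device buffers via sycl_utils::sycl_vector, the int4 dequant prologue, event_helper profiling, and readback. The include paths, the sizes, the float ScaleT argument, and the dequant_s4 template argument are assumptions, since the flattened patch dropped some template argument lists; this is an illustration, not the patch's own test code.

#include <cstdint>
#include "sycl/sycl_device.h"      // assumed include paths
#include "sycl/sycl_utils.h"
#include "sycl/sycl_gemm.h"
#include "sycl/sycl_prologue_b.h"

using Core = bestla::sycl_gemm::xve::DefaultSGemmCore;
using ProB = bestla::sycl_prologue_b::WeightS4Trans<Core, float>;  // ScaleT assumed float

void dequant_demo(bestla::sycl_device::SyclDevice& dev, int n, int k, int blocksize) {
  sycl::queue* q = dev.getQueue();  // must be a profiling queue for the timing below
  // B packs two 4-bit weights per byte; one scale per blocksize-chunk of k.
  // The kernel's tiling assumes blocksize is a multiple of its TileK (32 here).
  bestla::sycl_utils::sycl_vector<uint8_t> B(size_t(n) * k / 2, q);
  bestla::sycl_utils::sycl_vector<float> scale(size_t(n) * k / blocksize, q);
  bestla::sycl_utils::sycl_vector<float> out(size_t(n) * k, q);
  ProB::Param pb{B.data(), scale.data(), k / blocksize};  // ldb = scales per column
  auto ev = ProB::dequant_s4<bestla::sycl_prologue_b::KernelConfigTrans>(
      n, k, blocksize, pb, out.data(), q);
  ev.wait();
  float ms = bestla::sycl_utils::event_helper::execute_time(ev);   // kernel time in ms
  auto host = bestla::sycl_utils::sycl2host(out.data(), out.size(), q);  // blocking copy
  (void)ms;
  (void)host;
}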