diff --git a/CMakeLists.txt b/CMakeLists.txt index c341d83c7..3ace8bf36 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,6 +96,7 @@ endif() if (MSVC) add_compile_definitions(_CRT_SECURE_NO_WARNINGS NOMINMAX) + add_compile_options(/bigobj) if (BUILD_SHARED_LIBS) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() diff --git a/bestla/CMakeLists.txt b/bestla/CMakeLists.txt index 9c44e2fcc..f6d58049a 100644 --- a/bestla/CMakeLists.txt +++ b/bestla/CMakeLists.txt @@ -108,6 +108,8 @@ if(UT_BUILD) target_link_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address) endif() target_link_options(${PROJECT_NAME}_ut PRIVATE -lpthread) + else() + target_link_options(${PROJECT_NAME}_ut PUBLIC /STACK:5242880) endif() add_ut_flag(BTLA_UT_DEBUG) @@ -137,6 +139,8 @@ if(BTLA_UT_BENCHMARK) endif() if(NOT WIN32) target_link_options(${PROJECT_NAME}_benchmark PRIVATE -lpthread) + else() + target_link_options(${PROJECT_NAME}_benchmark PUBLIC /STACK:5242880) endif() target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE ${PROJECT_NAME} ${sycl_libs}) endif(BTLA_UT_BENCHMARK) diff --git a/bestla/bestla/bestla_device.h b/bestla/bestla/bestla_device.h index aaa1c3b28..d7c1f2fbb 100644 --- a/bestla/bestla/bestla_device.h +++ b/bestla/bestla/bestla_device.h @@ -20,12 +20,13 @@ #include "bestla_utils.h" #ifdef _WIN32 #include +#define FIXED_CACHE 1 #else #include +#define FIXED_CACHE 0 #endif #define FIXED_CACHE_SIZE ((1 << 20) - (128 << 10)) -#define FIXED_CACHE 1 namespace bestla { diff --git a/bestla/bestla/bestla_epilogue.h b/bestla/bestla/bestla_epilogue.h index 3360688f5..11fa5db99 100644 --- a/bestla/bestla/bestla_epilogue.h +++ b/bestla/bestla/bestla_epilogue.h @@ -23,6 +23,86 @@ namespace bestla { namespace epilogue { namespace gemm { +struct ParamPcKBlockCompInt8Epilogue { + void* scalesB; + BTLA_DTYPE scaleBdtype; + float* scalesA; + // optional if A asym + uint8_t* zpA = nullptr; + void* reduceB = nullptr; + BTLA_DTYPE reduceBdtype = BTLA_DTYPE::F32; + // optional if B asym + 
int8_t* zpB = nullptr; + float* reduceA = nullptr; + int K = 1; +}; +template +class PcKBlockCompInt8Epilogue { + public: + using Fp32Param = typename Fp32Epilogue::Param; + struct Param { + ParamPcKBlockCompInt8Epilogue param1; + Fp32Param param2; + }; + using Fp32Epi = Fp32Epilogue; + template + static BTLA_CODE forward(const int32_t* srcptr, const int cachestep, const int M_offset, const int N_offset, + const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) { + BTLA_CODE ret = BTLA_CODE::NotSupport; + float* scab = nullptr; + size_t ScaleBTmpSize = N * sizeof(float); + size_t ReduceBTmpSize = N * sizeof(float); + assert(cachesize >= (ScaleBTmpSize + ReduceBTmpSize)); + auto& param1 = _param.param1; + if (param1.scaleBdtype == BTLA_DTYPE::BF16) { + auto scache = reinterpret_cast(tmpcache); + ret = kernel::wrapper::Memcpy2DBf16CvtFp32::template forward( + reinterpret_cast(param1.scalesB) + N_offset, scache, 1, N, N, N, false); + assert(ret == BTLA_CODE::Success); + scab = scache; + } else if (param1.scaleBdtype == BTLA_DTYPE::F32) { + scab = reinterpret_cast(param1.scalesB) + N_offset; + } + float* redb = nullptr; + if (param1.reduceB) { + if (param1.reduceBdtype == BTLA_DTYPE::BF16) { + auto rcache = reinterpret_cast(reinterpret_cast(tmpcache) + ScaleBTmpSize); + ret = kernel::wrapper::Memcpy2DBf16CvtFp32::template forward( + reinterpret_cast(param1.reduceB) + N_offset, rcache, 1, N, N, N, false); + assert(ret == BTLA_CODE::Success); + redb = rcache; + } else if (param1.reduceBdtype == BTLA_DTYPE::F32) { + redb = reinterpret_cast(param1.reduceB) + N_offset; + } + } + auto tmpfp32ptr = reinterpret_cast(const_cast(srcptr)); + ret = kernel::wrapper::DequanS32Fp32::template forward(srcptr, cachestep, tmpfp32ptr, cachestep, M, N, + param1.scalesA + M_offset, 1, scab); + assert(ret == BTLA_CODE::Success); + + if (param1.zpA == nullptr) { + if (param1.zpB == nullptr) { + } else { + ret = kernel::wrapper::RemoveZeroPointBias::template 
forward_wei( + tmpfp32ptr, cachestep, M, N, param1.zpB + N_offset, scab, 1, param1.reduceA + M_offset); + } + } else { + if (param1.zpB == nullptr) { + ret = kernel::wrapper::RemoveZeroPointBias::template forward_act( + tmpfp32ptr, cachestep, M, N, param1.zpA + M_offset, param1.scalesA + M_offset, 1, redb); + } else { + ret = kernel::wrapper::RemoveZeroPointBias::template forward_both( + tmpfp32ptr, cachestep, M, N, param1.zpA + M_offset, param1.zpB + N_offset, param1.scalesA + M_offset, scab, + 1, param1.K, param1.reduceA + M_offset, redb); + } + } + Fp32Epilogue::template forward(tmpfp32ptr, cachestep, M_offset, N_offset, M, N, _param.param2, tmpcache, + cachesize); + + return ret; + } +}; + template struct ParamAccumulatorWriteBack { DT* C; @@ -30,13 +110,15 @@ struct ParamAccumulatorWriteBack { void* elt_const_v; }; -template +template class AccumulatorWriteBack { public: using SType = _SRC_T; using DType = _DST_T; using Param = ParamAccumulatorWriteBack; + using PcCompInt8Epi = bestla::epilogue::gemm::PcKBlockCompInt8Epilogue>; + template static BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) { auto COffset = M_offset * _param.ldc + N_offset; @@ -52,10 +134,13 @@ class AccumulatorWriteBack { } }; -template +template class CustomAccumulatorWriteBackWithEltop { public: + using PcCompInt8Epi = + bestla::epilogue::gemm::PcKBlockCompInt8Epilogue>; using Param = ParamAccumulatorWriteBack<_DST_T>; + template static BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) { auto COffset = M_offset * _param.ldc + N_offset; @@ -68,27 +153,17 @@ class CustomAccumulatorWriteBackWithEltop { } } }; -template -using AccumulatorWriteBackFp32 = AccumulatorWriteBack; -template -using AccumulatorWriteBackInt32 = 
AccumulatorWriteBack; -template -using AccumulatorWriteBackBf16 = AccumulatorWriteBack; -template -using AccumulatorWriteBackFp16 = AccumulatorWriteBack; -template -using AccumulatorWriteBackBf16Fp32 = AccumulatorWriteBack; -template -using AccumulatorWriteBackFp16Fp32 = AccumulatorWriteBack; -template -using AccumulatorWriteBackFp32Bf16 = AccumulatorWriteBack; +using AccumulatorWriteBackFp32 = AccumulatorWriteBack; +using AccumulatorWriteBackInt32 = AccumulatorWriteBack; +using AccumulatorWriteBackBf16 = AccumulatorWriteBack; +using AccumulatorWriteBackFp16 = AccumulatorWriteBack; +using AccumulatorWriteBackBf16Fp32 = AccumulatorWriteBack; +using AccumulatorWriteBackFp16Fp32 = AccumulatorWriteBack; +using AccumulatorWriteBackFp32Bf16 = AccumulatorWriteBack; -template -using AccumulatorWriteBackWithGeluFp32 = CustomAccumulatorWriteBackWithEltop; +using AccumulatorWriteBackWithGeluFp32 = CustomAccumulatorWriteBackWithEltop; -template -using AccumulatorWriteBackWithSwishFp32 = - CustomAccumulatorWriteBackWithEltop; +using AccumulatorWriteBackWithSwishFp32 = CustomAccumulatorWriteBackWithEltop; template struct ParamAlphaBetaProcess { @@ -96,11 +171,11 @@ struct ParamAlphaBetaProcess { int ldc, ldd; float alpha, beta; }; -template class AlphaBetaProcessFp32 { public: using Param = ParamAlphaBetaProcess; + template static BTLA_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) { auto DOffset = M_offset * _param.ldd + N_offset; @@ -120,10 +195,10 @@ struct ParamCompFp32BlockEpilogue { float* reduce = nullptr; int ldra; }; -template class CompFp32BlockEpilogue { public: using Param = ParamCompFp32BlockEpilogue; + template static BTLA_CODE forward(const float* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset, const int K_offset, const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) { @@ 
-171,10 +246,10 @@ struct ParamDequantInt32ToFp32 { float* scalesA; float* scalesB; }; -template class DequantInt32ToFp32 { public: using Param = ParamDequantInt32ToFp32; + template static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) { auto COffset = M_offset * _param.ldc + N_offset; @@ -185,88 +260,6 @@ class DequantInt32ToFp32 { } }; -struct ParamCompInt8BlockEpilogue { - void* scalesB; - BTLA_DTYPE scaleBdtype; - int ldsb; - float* scalesA; - int ldsa; - // optional if A asym - uint8_t* zpA = nullptr; - void* reduceB = nullptr; - BTLA_DTYPE reduceBdtype = BTLA_DTYPE::F32; - // optional if B asym - int8_t* zpB = nullptr; - float* reduceA = nullptr; - int K = 1; -}; -template -class CompInt8BlockEpilogue { - public: - using Param = ParamCompInt8BlockEpilogue; - static BTLA_CODE forward(const int32_t* srcptr, float* dstptr, const int cachestep, const int M_offset, - const int N_offset, const int K_offset, const int M, const int N, const Param& _param, - void* tmpcache, size_t cachesize) { - BTLA_CODE ret = BTLA_CODE::NotSupport; - float* scab = nullptr; - size_t ScaleBTmpSize = N * sizeof(float); - size_t ReduceBTmpSize = N * sizeof(float); - assert(cachesize >= (ScaleBTmpSize + ReduceBTmpSize)); - if (_param.scaleBdtype == BTLA_DTYPE::BF16) { - auto scache = reinterpret_cast(tmpcache); - ret = kernel::wrapper::Memcpy2DBf16CvtFp32::template forward( - reinterpret_cast(_param.scalesB) + N_offset + K_offset * _param.ldsb, scache, 1, N, N, N, - false); - assert(ret == BTLA_CODE::Success); - scab = scache; - } else if (_param.scaleBdtype == BTLA_DTYPE::F32) { - scab = reinterpret_cast(_param.scalesB) + N_offset + K_offset * _param.ldsb; - } - float* redb = nullptr; - if (_param.reduceB) { - if (_param.reduceBdtype == BTLA_DTYPE::BF16) { - auto rcache = reinterpret_cast(reinterpret_cast(tmpcache) + ScaleBTmpSize); - ret = 
kernel::wrapper::Memcpy2DBf16CvtFp32::template forward( - reinterpret_cast(_param.reduceB) + N_offset + K_offset * _param.ldsb, rcache, 1, N, N, N, - false); - assert(ret == BTLA_CODE::Success); - redb = rcache; - } else if (_param.reduceBdtype == BTLA_DTYPE::F32) { - redb = reinterpret_cast(_param.reduceB) + N_offset + K_offset * _param.ldsb; - } - } - ret = kernel::wrapper::DequanS32Fp32::template forward( - srcptr, cachestep, reinterpret_cast(const_cast(srcptr)), cachestep, M, N, - _param.scalesA + M_offset * _param.ldsa + K_offset, _param.ldsa, scab); - assert(ret == BTLA_CODE::Success); - ret = kernel::wrapper::AccumulateFp32::template forward(reinterpret_cast(srcptr), cachestep, - dstptr, cachestep, M, N); - assert(ret == BTLA_CODE::Success); - - if (_param.zpA == nullptr) { - if (_param.zpB == nullptr) { - return ret; - } else { - ret = kernel::wrapper::RemoveZeroPointBias::template forward_wei( - dstptr, cachestep, M, N, _param.zpB + N_offset + K_offset * _param.ldsb, scab, _param.ldsa, - _param.reduceA + M_offset * _param.ldsa + K_offset); - } - } else { - if (_param.zpB == nullptr) { - ret = kernel::wrapper::RemoveZeroPointBias::template forward_act( - dstptr, cachestep, M, N, _param.zpA + M_offset * _param.ldsa + K_offset, - _param.scalesA + M_offset * _param.ldsa + K_offset, _param.ldsa, redb); - } else { - ret = kernel::wrapper::RemoveZeroPointBias::template forward_both( - dstptr, cachestep, M, N, _param.zpA + M_offset * _param.ldsa + K_offset, - _param.zpB + N_offset + K_offset * _param.ldsb, _param.scalesA + M_offset * _param.ldsa + K_offset, scab, - _param.ldsa, _param.K, _param.reduceA + M_offset * _param.ldsa + K_offset, redb); - } - } - return ret; - } -}; - struct ParamZpDequantInt32ToFp32 { // necessary float* C; @@ -282,10 +275,10 @@ struct ParamZpDequantInt32ToFp32 { float* reduceA = nullptr; int K = 1; }; -template class ZpDequantInt32ToFp32 { public: using Param = ParamZpDequantInt32ToFp32; + template static BTLA_CODE forward(const 
int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) { auto COffset = M_offset * _param.ldc + N_offset; @@ -323,10 +316,10 @@ struct ParamAlphaBetaProcessS32U8 { float scaleAcc, scaleC; int zpC; }; -template class AlphaBetaProcessS32U8 { public: using Param = ParamAlphaBetaProcessS32U8; + template static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) { auto COffset = M_offset * _param.ldc + N_offset; diff --git a/bestla/bestla/bestla_gemm.h b/bestla/bestla/bestla_gemm.h index 3316127f5..fe521c4ab 100644 --- a/bestla/bestla/bestla_gemm.h +++ b/bestla/bestla/bestla_gemm.h @@ -4816,7 +4816,7 @@ class CoreCodeBase { static auto constexpr KTILE = Code::KTILE; static auto constexpr PACK_ROW = Code::PackRow; static auto constexpr COMP = Code::COMPUTE; - static int constexpr PREFERRED_N = NTILE * 3; + static int constexpr PREFERRED_N = NTILE * 4; static auto constexpr ISA = Code::ISA; static auto constexpr ID = CoreAttr::make_core_id(NTILE, PACK_ROW, COMP, ISA); void configure(int _M, int _N, int _K) { (void)(0); } @@ -4842,7 +4842,7 @@ class CoreCodeBaseAMX { static auto constexpr KTILE = Code::KTILE; static auto constexpr PACK_ROW = Code::PackRow; static auto constexpr COMP = Code::COMPUTE; - static int constexpr PREFERRED_N = NTILE * 3; + static int constexpr PREFERRED_N = NTILE * 4; static auto constexpr ISA = Code::ISA; static auto constexpr ID = CoreAttr::make_core_id(_NTILE, PACK_ROW, COMP, ISA); Xbyak::CodeGenerator cfgcode; diff --git a/bestla/bestla/bestla_parallel.h b/bestla/bestla/bestla_parallel.h index b60a81d89..04a310a5e 100644 --- a/bestla/bestla/bestla_parallel.h +++ b/bestla/bestla/bestla_parallel.h @@ -212,12 +212,14 @@ class StdThreading : public IThreading { memcpy(reinterpret_cast(core_order.data() + 
_cd->getPcoreNum() + _cd->getEcoreNum()), reinterpret_cast(_cd->getSMTCores()), _cd->getSMTcoreNum() * sizeof(int)); } else { - core_order.resize(mThreadNum); + core_order.resize(_cd->getCores() * 2); // *2 for SMT if (_cd->isClient()) { - for (int i = 0; i < _cd->getCores(); i++) core_order[i] = 2 * i; - for (int i = _cd->getCores(); i < mThreadNum; i++) core_order[i] = 2 * (i - _cd->getCores()) + 1; + for (int i = 0; i < _cd->getCores(); i++) { + core_order[i] = 2 * i; + core_order[i + _cd->getCores()] = 2 * i + 1; + } } else { - for (int i = 0; i < mThreadNum; i++) core_order[i] = i; + for (int i = 0; i < _cd->getCores() * 2; i++) core_order[i] = i; } } _cd->core_bond(core_order[0]); @@ -483,8 +485,8 @@ class SchedulerBase : public Scheduler2D { update_cache_blocking(); Scheduler2D::set(mThdSize, mSize, mStep); mL2Use = static_cast(mBlock[0]) * mBlock[1] * mEleSize[2]; - mL2Use += static_cast(mBlock[1]) * mBlock[2] * mEleSize[1]; - mL2Use += static_cast(mStep[0]) * mBlock[2] * mEleSize[0]; + mL2Use += static_cast(mBlock[1]) * mBlock[2] * mEleSize[1] * 2; + mL2Use += static_cast(mStep[0]) * mBlock[2] * mEleSize[0] * 2; } static float constexpr DensityThres = 16; static size_t constexpr ReservedSize = 32ULL * 1024ULL; @@ -520,11 +522,11 @@ class SchedulerBase : public Scheduler2D { } virtual void cache_blocking_compute() { - int constexpr KRef = 256; + size_t constexpr KRef = 256; size_t valid_total = mL2Size - ReservedSize; - auto asize = mStep[0] * KRef * mEleSize[0]; - size_t csize_total = valid_total - _GemmCore_T::PREFERRED_N * KRef * mEleSize[1] - asize; - int maxM = static_cast(csize_total / _GemmCore_T::PREFERRED_N / mEleSize[2]); + size_t asize = KRef * mStep[0] * mEleSize[0] * 2; + size_t bsize = _GemmCore_T::PREFERRED_N * KRef * mEleSize[1] * 2; + int maxM = static_cast((valid_total - bsize - asize) / (_GemmCore_T::PREFERRED_N * mEleSize[2])); maxM = utils::downdiv(maxM, mStep[0]); int nthdm = mThdSize[0] / mStep[0]; if (maxM < nthdm) { @@ -533,7 +535,7 
@@ class SchedulerBase : public Scheduler2D { } else { mBlock[0] = mThdSize[0]; } - int maxN = static_cast((valid_total - asize) / (mBlock[0] * mEleSize[2] + KRef * mEleSize[1])); + int maxN = static_cast((valid_total - asize) / (mBlock[0] * mEleSize[2] + KRef * mEleSize[1] * 2)); maxN = utils::downdiv(maxN, mStep[1]); int nthdn = mThdSize[1] / mStep[1]; if (maxN < nthdn) { @@ -542,8 +544,9 @@ class SchedulerBase : public Scheduler2D { } else { mBlock[1] = mThdSize[1]; } - auto rawk = static_cast((valid_total - mBlock[0] * mBlock[1] * mEleSize[2]) / - (mStep[0] * mEleSize[0] + mBlock[1] * mEleSize[1])); + bsize = KRef * mBlock[1] * mEleSize[1] * 2; + size_t csize = static_cast(mBlock[0]) * mBlock[1] * mEleSize[2]; + auto rawk = static_cast((valid_total - csize) / (mStep[0] * mEleSize[0] + mBlock[1] * mEleSize[1]) / 2); rawk = std::min(rawk, mSizePadded[2]); mBlock[2] = utils::padto_le(rawk, mStep[2]); } @@ -570,195 +573,6 @@ class SchedulerBase : public Scheduler2D { int mBlock[3] = {0, 0, 0}; }; -template -class SchedulerKBlock : public Scheduler2D { - // Block[2]: block size of K must be multiplier of mKBlock - // or factor of mKBlock - public: - using ThreadProblem = ThreadProblemBase; - SchedulerKBlock() = default; - SchedulerKBlock(const Config& config) { update(config); } - virtual void getIndex(ThreadProblem& problem) { - problem.stacksize = mL2Size; - problem.tmpcachesize = mL2Size - mL2Use; - problem.block[0] = mBlock[0]; - problem.block[1] = mBlock[1]; - problem.block[2] = mBlock[2]; - Scheduler2D::getIndex(problem); - } - - void update(const Config& config) { - for (size_t i = 0; i < 3; i++) { - mSize[i] = config.problem.dims[i + 1]; - mSizePadded[i] = utils::padto(mSize[i], mStep[i]); - } - mThdCount = config.threads; - mL2Size = config.l2cache; - mL1Size = config.l1cache; - moffset[0] = config.offset[0]; - moffset[1] = config.offset[1]; - mKBlock = config.problem.dims[4]; - if (mSize[0] <= 0 || mSize[1] <= 0 || mSize[2] <= 0) { - return; - } - 
schedule(); - assert(this->mL2Use <= this->mL2Size); - assert(this->mBlock[0] > 0); - assert(this->mBlock[1] > 0); - assert(this->mBlock[2] > 0); - } - - constexpr static BTLA_ISA gemm_ISA() { return _GemmCore_T::ISA; } - - constexpr int valid_theads() { return mThdValid; } - - void print() { - printf("Thread Block:(%d,%d)\n", mThdSize[0], mThdSize[1]); - printf("Thread in use:%d of %d, Nx%d\n", mThdValid, mThdCount, mThdPerRow); - printf("GEMM MStep:%d NStep:%d KStep:%d\n", mBlock[0], mBlock[1], mBlock[2]); - printf("Cache Size:%zu used:%zu\n", mL2Size, mL2Use); - } - - template - friend class SchedulerDispatcher; - - protected: - void schedule() { - int rownum = utils::updiv(mSize[0], mStep[0]); - int colnum = utils::updiv(mSize[1], mStep[1]); - mDensity = static_cast(mSize[0]) * mSize[1] / (mSize[0] + mSize[1]); - int maxN = 0; - float maxScore = std::numeric_limits::min(); - int core_enum = static_cast(std::sqrt(mThdCount)); - for (int i = 1; i <= core_enum; i += 1) { - generate_by_cores(i, mThdCount / i, rownum, colnum); - auto thdscore = calculate_score(); - if (maxScore < thdscore) { - maxScore = thdscore; - maxN = i; - } - generate_by_cores(mThdCount / i, i, rownum, colnum); - thdscore = calculate_score(); - if (maxScore < thdscore) { - maxScore = thdscore; - maxN = mThdCount / i; - } - } - generate_by_cores(maxN, mThdCount / maxN, rownum, colnum); - update_cache_blocking(); - Scheduler2D::set(mThdSize, mSize, mStep); - mL2Use = static_cast(mBlock[0]) * mBlock[1] * mEleSize[2] * 2; - mL2Use += static_cast(mBlock[1]) * mBlock[2] * mEleSize[1]; - mL2Use += static_cast(mStep[0]) * mBlock[2] * mEleSize[0]; - } - static float constexpr DensityThres = 16; - - float calculate_score() { - int tmpnstep = mThdSize[1] < _GemmCore_T::PREFERRED_N ? 
mThdSize[1] : _GemmCore_T::PREFERRED_N; - float threadratio = static_cast(mThdValid) / mThdCount; - float density = static_cast(tmpnstep) * mThdSize[0] / (tmpnstep + mThdSize[0]); - if (mDensity < DensityThres) { - return threadratio * 1.f; - } - return (threadratio * 1.f + density * 0.0016f); - } - - void generate_by_cores(int ny, int nx, int rownum, int colnum) { - mThdSize[0] = utils::updiv(rownum, ny) * mStep[0]; - mThdSize[1] = utils::updiv(colnum, nx) * mStep[1]; - mThdPerRow = utils::updiv(mSize[1], mThdSize[1]); - mThdValid = utils::updiv(mSize[0], mThdSize[0]) * mThdPerRow; - } - - // C-KBlock Accumulator=MBlock*NBlock - // C-K Accumulator=MBlock*NBlock - // B=MBlock*KBlock - // A=MTILE*KBlock - void update_cache_blocking() { - if (mDensity <= DensityThres) { - return cache_blocking_memory(); - } else { - return cache_blocking_compute(); - } - } - - void cache_blocking_compute() { - int constexpr KRef = 256; - int constexpr NRef = _GemmCore_T::PREFERRED_N; - int constexpr MTile = _GemmCore_T::MTILE; - int constexpr KSplitStage = 16; - int BlkNum = utils::updiv(mSize[2], mKBlock); - int KSplitSize = utils::padto(utils::updiv(mSize[2], KSplitStage), mStep[2]); - mBlock[1] = NRef < mThdSize[1] ? 
NRef : mThdSize[1]; - if (KSplitStage * mStep[2] >= mSize[2]) { - mBlock[2] = mSize[2]; - } else if (KSplitSize >= mKBlock) { - mBlock[2] = mKBlock; - } else { - int scale = utils::downdiv(KSplitStage, BlkNum); - for (; scale >= 1; scale--) { - if (mKBlock % scale == 0) { - break; - } - } - mBlock[2] = utils::downdiv(mKBlock, scale); - mBlock[2] = utils::padto_le(mBlock[2], mStep[2]); - } - size_t size_remain = mL2Size - mBlock[1] * mBlock[2] * mEleSize[1]; - // MBlock*KBlock*ASize+MBlock*NBlock*CSize*2<=size_remain - int maxMBlock = static_cast(size_remain / (mBlock[1] * mEleSize[2] * 2 + mBlock[2] * mEleSize[0])); - int maxM = utils::downdiv(maxMBlock, mStep[0]); - int nthdm = mThdSize[0] / mStep[0]; - if (maxM < nthdm) { - int niter = utils::updiv(nthdm, maxM); - mBlock[0] = utils::updiv(nthdm, niter) * mStep[0]; - } else { - mBlock[0] = mThdSize[0]; - } - } - - void cache_blocking_memory() { - mBlock[0] = _GemmCore_T::MTILE; - size_t startK = std::max(16, _GemmCore_T::KTILE); - auto getMaxN = [&](size_t refk) { - size_t sizeA = refk * mEleSize[0] * mBlock[0]; - size_t maxN = (mL1Size - sizeA) / (mBlock[0] * mEleSize[2] * 2 + refk * mEleSize[1]); - return maxN; - }; - auto getMaxK = [&](size_t refN) { - size_t sizeC = refN * mEleSize[2] * mBlock[0] * 2; - size_t maxK = (mL1Size - sizeC) / (mBlock[0] * mEleSize[0] + refN * mEleSize[1]); - return maxK; - }; - auto maxN = getMaxN(startK); - if (maxN <= mThdSize[1]) { - mBlock[1] = static_cast(maxN); - mBlock[1] = utils::padto_le(mBlock[1], mStep[1]); - mBlock[2] = static_cast(startK); - } else { - mBlock[1] = mThdSize[1]; - mBlock[2] = static_cast(getMaxK(mBlock[1])); - mBlock[2] = utils::padto_le(mBlock[2], mStep[2]); - mBlock[2] = std::min(mKBlock, mBlock[2]); - auto tmp = utils::updiv(mKBlock, mBlock[2]); - while (mKBlock % tmp != 0) tmp++; // TODO(Yu) optimize - mBlock[2] = utils::downdiv(mKBlock, tmp); - } - } - size_t mL2Size = 0, mL1Size = 0, mL2Use = 0; - float mDensity = 0.f; - int mKBlock = 0; - - 
private: - int mSize[3] = {0, 0, 0}; - int mThdSize[3] = {0, 0, 0}; - static constexpr int mStep[3] = {_GemmCore_T::MTILE, _GemmCore_T::NTILE, _GemmCore_T::KTILE}; - static constexpr int mEleSize[3] = {sizeof(typename _GemmCore_T::AType), sizeof(typename _GemmCore_T::BType), - sizeof(typename _GemmCore_T::CType)}; - int mSizePadded[3] = {0, 0, 0}; - int mBlock[3] = {0, 0, 0}; -}; - template class SchedulerKBlockS : public SchedulerBase<_GemmCore_T> { // Block[2]: block size of K must be multiplier of mKBlock @@ -792,12 +606,14 @@ class SchedulerKBlockS : public SchedulerBase<_GemmCore_T> { static size_t constexpr ReservedSize = 32ULL * 1024ULL; void cache_blocking_compute() override { - int constexpr KRef = 256; - int constexpr CorSize = sizeof(float) + sizeof(int8_t) + sizeof(float); + size_t constexpr KRef = 256; + size_t constexpr CorSize = sizeof(float) + sizeof(int8_t) + sizeof(float); size_t valid_total = this->mL2Size - ReservedSize; auto blks = utils::updiv(KRef, this->mKBlock); - auto asize = this->mStep[0] * KRef * this->mEleSize[0] + this->mStep[0] * blks * CorSize; + auto asize = KRef * this->mStep[0] * this->mEleSize[0] + blks * this->mStep[0] * CorSize; + asize *= 2; auto bsize = _GemmCore_T::PREFERRED_N * KRef * this->mEleSize[1] + _GemmCore_T::PREFERRED_N * blks * CorSize; + bsize *= 2; size_t csize_total = valid_total - asize - bsize; int maxM = static_cast(csize_total / _GemmCore_T::PREFERRED_N / this->mEleSize[2]); maxM = utils::downdiv(maxM, this->mStep[0]); @@ -808,8 +624,8 @@ class SchedulerKBlockS : public SchedulerBase<_GemmCore_T> { } else { this->mBlock[0] = this->mThdSize[0]; } - int maxN = static_cast((valid_total - asize) / - (this->mBlock[0] * this->mEleSize[2] + KRef * this->mEleSize[1] + blks * CorSize)); + int maxN = static_cast((valid_total - asize) / (this->mBlock[0] * this->mEleSize[2] + + (KRef * this->mEleSize[1] + blks * CorSize) * 2)); maxN = utils::downdiv(maxN, this->mStep[1]); int nthdn = this->mThdSize[1] / 
this->mStep[1]; if (maxN < nthdn) { @@ -818,13 +634,13 @@ class SchedulerKBlockS : public SchedulerBase<_GemmCore_T> { } else { this->mBlock[1] = this->mThdSize[1]; } - auto rawk = static_cast((valid_total - this->mBlock[0] * this->mBlock[1] * this->mEleSize[2]) / + size_t csize = static_cast(this->mBlock[0]) * this->mBlock[1] * this->mEleSize[2]; + auto rawk = static_cast((valid_total - csize) / 2 / (this->mStep[0] * this->mEleSize[0] + float(CorSize * (this->mStep[0] + this->mBlock[1])) / this->mKBlock + this->mBlock[1] * this->mEleSize[1])); if (rawk < this->mKBlock) { - rawk = static_cast((valid_total - this->mBlock[0] * this->mBlock[1] * this->mEleSize[2] - - 1 * CorSize * (this->mStep[0] + this->mBlock[1])) / + rawk = static_cast((valid_total - csize - 1 * CorSize * (this->mStep[0] + this->mBlock[1])) / 2 / (this->mStep[0] * this->mEleSize[0] + this->mBlock[1] * this->mEleSize[1])); } rawk = std::min(rawk, this->mSizePadded[2]); diff --git a/bestla/bestla/bestla_wrapper.h b/bestla/bestla/bestla_wrapper.h index af2c93675..00c7e8d40 100644 --- a/bestla/bestla/bestla_wrapper.h +++ b/bestla/bestla/bestla_wrapper.h @@ -200,18 +200,73 @@ class S1 { } }; +class NBitsHelper { + public: + template + static inline utils::GemvParamB createB(storage::gemm::StorageWeightKBlockNInteger* packedW) { + if (packedW->mDType == BTLA_DTYPE::S4_CLIP) { + return S4::createB(packedW); + } + if (packedW->mDType == BTLA_DTYPE::S3_CLIP) { + return S3::createB(packedW); + } + if (packedW->mDType == BTLA_DTYPE::S5_CLIP) { + return S5::createB(packedW); + } + if (packedW->mDType == BTLA_DTYPE::S2_CLIP) { + return S2::createB(packedW); + } + if (packedW->mDType == BTLA_DTYPE::S6_CLIP) { + return S6::createB(packedW); + } + if (packedW->mDType == BTLA_DTYPE::S7_CLIP) { + return S7::createB(packedW); + } + if (packedW->mDType == BTLA_DTYPE::S1_CLIP) { + return S1::createB(packedW); + } + assert(0); + return utils::GemvParamB(); + } + template + static void updateBNStep(utils::GemvParamB& 
paramB, int n_offset) { + if (paramB.nbits == 4) { + return S4::updateBNStep(paramB, n_offset); + } + if (paramB.nbits == 3) { + return S3::updateBNStep(paramB, n_offset); + } + if (paramB.nbits == 5) { + return S5::updateBNStep(paramB, n_offset); + } + if (paramB.nbits == 2) { + return S2::updateBNStep(paramB, n_offset); + } + if (paramB.nbits == 6) { + return S6::updateBNStep(paramB, n_offset); + } + if (paramB.nbits == 7) { + return S7::updateBNStep(paramB, n_offset); + } + if (paramB.nbits == 1) { + return S1::updateBNStep(paramB, n_offset); + } + assert(0); + } +}; + } // namespace gemv_nbits namespace gemm { template class _PrologueA_T, - template class _PrologueB_T, template class _Epilogue_T> + template class _PrologueB_T, class _Epilogue_T> class LauncherBase { public: using GemmCore = _GemmCore_T; static constexpr BTLA_ISA ISA = _RT_ISA_T; using PrologueA = _PrologueA_T; using PrologueB = _PrologueB_T; - using Epilogue = _Epilogue_T<_RT_ISA_T>; + using Epilogue = _Epilogue_T; using AType = typename GemmCore::AType; using AParam = typename PrologueA::Param; using BType = typename GemmCore::BType; @@ -228,7 +283,6 @@ class LauncherBase { _GemmCore_T mGemmCore; PrologueA mProA; PrologueB mProB; - Epilogue mEpilogue; class GEMVWrapper { public: @@ -239,22 +293,53 @@ class LauncherBase { if constexpr (!std::is_same_v> && !std::is_same_v> && + !std::is_same_v> && + !std::is_same_v> && !std::is_same_v>) { return false; } if constexpr (GemmCore::ISA == BTLA_ISA::AVX2) { #if CompileAVX2() - static_assert(GemmCore::PACK_ROW == 1); if constexpr (GemmCore::COMP == bestla::gemm::CompType::COMP_FP32) { + static_assert(GemmCore::PACK_ROW == 1); + return true; + } + if constexpr (GemmCore::COMP == bestla::gemm::CompType::COMP_INT8_US_INT32) { + static_assert(GemmCore::PACK_ROW == 4); + return true; + } +#endif + } + if constexpr (GemmCore::ISA == BTLA_ISA::AVX512_VNNI || GemmCore::ISA == BTLA_ISA::AMX_INT8) { +#if CompileAVX512VNNI() + if constexpr (GemmCore::COMP == 
bestla::gemm::CompType::COMP_INT8_US_INT32) { + static_assert(GemmCore::PACK_ROW == 4); + return true; + } +#endif + } + if constexpr (GemmCore::ISA == BTLA_ISA::AVX_VNNI) { +#if CompileAVXVNNI() + if constexpr (GemmCore::COMP == bestla::gemm::CompType::COMP_INT8_US_INT32) { + static_assert(GemmCore::PACK_ROW == 4); return true; } #endif } if constexpr (GemmCore::ISA == BTLA_ISA::AVX512F) { #if CompileAVX512F() - static_assert(GemmCore::PACK_ROW == 1); if constexpr (GemmCore::COMP == bestla::gemm::CompType::COMP_FP32) { + static_assert(GemmCore::PACK_ROW == 1); + return true; + } +#endif + } + if constexpr (GemmCore::ISA == BTLA_ISA::AVX512BW) { +#if CompileAVX512F() + if constexpr (GemmCore::COMP == bestla::gemm::CompType::COMP_INT8_US_INT32) { + static_assert(GemmCore::PACK_ROW == 4); return true; } #endif @@ -281,46 +366,69 @@ class LauncherBase { return impl; } - template + template static void gemv_kblock(const Param& _param, const parallel::gemm::ThreadProblemBase& _config) { if constexpr (support()) { auto constexpr TmpSize = 16 * 1024LL; auto constexpr CSize = 8 * 1024LL; auto StackTmp_ = alloca(TmpSize + CSize); auto StackTmp = utils::cpu_pointer_align(StackTmp_); - auto tmpc_ptr = reinterpret_cast((char*)StackTmp + TmpSize); + auto tmpc_ptr = reinterpret_cast((char*)StackTmp + TmpSize); static_assert(CSize >= (MTILE * GemmCore::NTILE * sizeof(float))); - utils::GemvParamB paramB = SNbits::template createB(_param.paramB.packedW); - const float* Aptr = _param.paramA.A; - if constexpr (std::is_same_v>) { - if (_param.paramA.reordered && _param.paramA.reordered->template APtr()) { - Aptr = _param.paramA.reordered->template APtr(); - } - } + utils::GemvParamB paramB = gemv_nbits::NBitsHelper::template createB(_param.paramB.packedW); int m = _param.problem.dims[1]; int n = _param.problem.dims[2]; int k = _param.problem.dims[3]; int kblocksize = _param.problem.dims[4]; - SNbits::template updateBNStep(paramB, _config.loc[1]); + gemv_nbits::NBitsHelper::template 
updateBNStep(paramB, _config.loc[1]); int size_padded = utils::padto_le(_config.size[1], GemmCore::NTILE); int in = 0; for (; in < size_padded; in += GemmCore::NTILE) { - if constexpr (std::is_same_v) { + if constexpr (GemmCore::COMP == bestla::gemm::CompType::COMP_INT8_US_INT32) { + utils::GemvParamA paramA{ + _param.paramA.quan->template APtr(), _param.paramA.quan->template SPtr(), + _param.paramA.quan->template ZPtr(), _param.paramA.quan->mKPad, _param.paramA.quan->CStep()}; + kernel::wrapper::GEMVWoqNBits::forward_u8s8_fp32<_RT_ISA_T, ScaleT, GemmCore::NTILE, MTILE>( + paramA, paramB, tmpc_ptr, GemmCore::NTILE, k, kblocksize, StackTmp, TmpSize); + Epilogue::Fp32Epi::template forward(tmpc_ptr, GemmCore::NTILE, 0, _config.loc[1] + in, MTILE, + GemmCore::NTILE, _param.paramC.param2, StackTmp, TmpSize); + } else { + const float* Aptr = _param.paramA.A; + if constexpr (std::is_same_v>) { + if (_param.paramA.reordered && _param.paramA.reordered->template APtr()) { + Aptr = _param.paramA.reordered->template APtr(); + } + } kernel::wrapper::GEMVWoqNBits::forward_fp32_fp32<_RT_ISA_T, ScaleT, GemmCore::NTILE, MTILE>( Aptr, _param.paramA.lda, paramB, tmpc_ptr, GemmCore::NTILE, k, kblocksize, StackTmp, TmpSize); + Epilogue::template forward(tmpc_ptr, GemmCore::NTILE, 0, _config.loc[1] + in, MTILE, GemmCore::NTILE, + _param.paramC, StackTmp, TmpSize); } - Epilogue::forward(tmpc_ptr, GemmCore::NTILE, 0, _config.loc[1] + in, MTILE, GemmCore::NTILE, _param.paramC, - StackTmp, TmpSize); - SNbits::template updateBNStep(paramB, GemmCore::NTILE); + gemv_nbits::NBitsHelper::template updateBNStep(paramB, GemmCore::NTILE); } if (size_padded != _config.size[1]) { - if constexpr (std::is_same_v) { + if constexpr (GemmCore::COMP == bestla::gemm::CompType::COMP_INT8_US_INT32) { + utils::GemvParamA paramA{ + _param.paramA.quan->template APtr(), _param.paramA.quan->template SPtr(), + _param.paramA.quan->template ZPtr(), _param.paramA.quan->mKPad, _param.paramA.quan->CStep()}; + 
kernel::wrapper::GEMVWoqNBits::forward_u8s8_fp32<_RT_ISA_T, ScaleT, GemmCore::NTILE, MTILE>( + paramA, paramB, tmpc_ptr, GemmCore::NTILE, k, kblocksize, StackTmp, TmpSize); + Epilogue::Fp32Epi::template forward(tmpc_ptr, GemmCore::NTILE, 0, _config.loc[1] + in, MTILE, + (_config.size[1] - in), _param.paramC.param2, StackTmp, TmpSize); + } else { + const float* Aptr = _param.paramA.A; + if constexpr (std::is_same_v>) { + if (_param.paramA.reordered && _param.paramA.reordered->template APtr()) { + Aptr = _param.paramA.reordered->template APtr(); + } + } kernel::wrapper::GEMVWoqNBits::forward_fp32_fp32<_RT_ISA_T, ScaleT, GemmCore::NTILE, MTILE>( Aptr, _param.paramA.lda, paramB, tmpc_ptr, GemmCore::NTILE, k, kblocksize, StackTmp, TmpSize); + Epilogue::template forward(tmpc_ptr, GemmCore::NTILE, 0, _config.loc[1] + in, MTILE, + (_config.size[1] - in), _param.paramC, StackTmp, TmpSize); } - Epilogue::forward(tmpc_ptr, GemmCore::NTILE, 0, _config.loc[1] + in, MTILE, (_config.size[1] - in), - _param.paramC, StackTmp, TmpSize); } } } @@ -329,187 +437,28 @@ class LauncherBase { if constexpr (support()) { assert(_param.problem.dims[4] > 0); auto& m = _param.problem.dims[1]; - if (_param.paramB.packedW->mDType == BTLA_DTYPE::S4_CLIP) { - if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::BF16) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - 
if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } - return; - } - if (_param.paramB.packedW->mDType == BTLA_DTYPE::S5_CLIP) { - if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::BF16) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } - return; - } - if (_param.paramB.packedW->mDType == BTLA_DTYPE::S6_CLIP) { - if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::BF16) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) 
gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } + if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { + if (m == 1) gemv_kblock(_param, _config); + if (m == 2) gemv_kblock(_param, _config); + if (m == 3) gemv_kblock(_param, _config); + if (m == 4) gemv_kblock(_param, _config); + if constexpr (Reg32) { + if (m == 5) gemv_kblock(_param, _config); + if (m == 6) gemv_kblock(_param, _config); + if (m == 7) gemv_kblock(_param, _config); + if (m == 8) gemv_kblock(_param, _config); } - return; - } - if (_param.paramB.packedW->mDType == BTLA_DTYPE::S7_CLIP) { - if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::BF16) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } + } else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::BF16) { + if (m == 1) gemv_kblock(_param, _config); + if (m == 2) gemv_kblock(_param, _config); + if (m == 3) gemv_kblock(_param, _config); + if (m == 4) gemv_kblock(_param, _config); + if constexpr (Reg32) { + if (m == 5) gemv_kblock(_param, _config); + if (m == 6) gemv_kblock(_param, _config); + if (m == 7) gemv_kblock(_param, _config); + if (m == 8) gemv_kblock(_param, _config); } - return; - } - if (_param.paramB.packedW->mDType == 
BTLA_DTYPE::S3_CLIP) { - if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::BF16) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } - return; - } - if (_param.paramB.packedW->mDType == BTLA_DTYPE::S1_CLIP) { - if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::BF16) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } - return; - } - if (_param.paramB.packedW->mDType == BTLA_DTYPE::S2_CLIP) { - if 
(_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::BF16) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } - return; } } } @@ -582,20 +531,20 @@ class LauncherBase { } } } - mEpilogue.forward(tmpC, _config.block[1], (_config.loc[0] + blk_m), _config.loc[1] + blk_n, blk_msize, blk_nsize, - _param.paramC, tmpcache, _config.tmpcachesize); + Epilogue::template forward(tmpC, _config.block[1], (_config.loc[0] + blk_m), _config.loc[1] + blk_n, blk_msize, + blk_nsize, _param.paramC, tmpcache, _config.tmpcachesize); } }; template class _PrologueA_T, - template class _PrologueB_T, template class _Epilogue_T> + template class _PrologueB_T, class _Epilogue_T> class LauncherIntKBlock { public: using GemmCore = _GemmCore_T; static constexpr BTLA_ISA ISA = _RT_ISA_T; using PrologueA = _PrologueA_T; using PrologueB = _PrologueB_T; - using Epilogue = _Epilogue_T<_RT_ISA_T>; + using Epilogue = _Epilogue_T; using AType = typename GemmCore::AType; using AParam = typename PrologueA::Param; using BType = typename GemmCore::BType; @@ -613,7 +562,6 @@ class LauncherIntKBlock { _GemmCore_T mGemmCore; PrologueA mProA; PrologueB mProB; - Epilogue mEpilogue; class GEMVWrapper { public: @@ -628,41 +576,44 @@ class 
LauncherIntKBlock { } if constexpr (GemmCore::ISA == BTLA_ISA::AVX_VNNI) { #if CompileAVXVNNI() - static_assert(GemmCore::PACK_ROW == 4); if constexpr (GemmCore::COMP == bestla::gemm::CompType::COMP_INT8_US_FP32) { + static_assert(GemmCore::PACK_ROW == 4); return true; } if constexpr (GemmCore::COMP == bestla::gemm::CompType::COMP_INT8_SS_FP32) { + static_assert(GemmCore::PACK_ROW == 4); return true; } #endif } if constexpr (GemmCore::ISA == BTLA_ISA::AVX2) { #if CompileAVX2() - static_assert(GemmCore::PACK_ROW == 4); if constexpr (GemmCore::COMP == bestla::gemm::CompType::COMP_INT8_US_FP32) { + static_assert(GemmCore::PACK_ROW == 4); return true; } if constexpr (GemmCore::COMP == bestla::gemm::CompType::COMP_INT8_SS_FP32) { + static_assert(GemmCore::PACK_ROW == 4); return true; } #endif } if constexpr (GemmCore::ISA == BTLA_ISA::AVX512BW) { #if CompileAVX512F() - static_assert(GemmCore::PACK_ROW == 4); if constexpr (GemmCore::COMP == bestla::gemm::CompType::COMP_INT8_US_FP32) { + static_assert(GemmCore::PACK_ROW == 4); return true; } #endif } if constexpr (GemmCore::ISA == BTLA_ISA::AVX512_VNNI || GemmCore::ISA == BTLA_ISA::AMX_INT8) { #if CompileAVX512VNNI() - static_assert(GemmCore::PACK_ROW == 4); if constexpr (GemmCore::COMP == bestla::gemm::CompType::COMP_INT8_US_FP32) { + static_assert(GemmCore::PACK_ROW == 4); return true; } if constexpr (GemmCore::COMP == bestla::gemm::CompType::COMP_INT8_SS_FP32) { + static_assert(GemmCore::PACK_ROW == 4); return true; } #endif @@ -687,7 +638,7 @@ class LauncherIntKBlock { return impl; } - template + template static void gemv_kblock(const Param& _param, const parallel::gemm::ThreadProblemBase& _config) { if constexpr (support()) { auto constexpr TmpSize = 16 * 1024LL; @@ -696,7 +647,7 @@ class LauncherIntKBlock { auto StackTmp_ = alloca(TmpSize + CSize); auto StackTmp = utils::cpu_pointer_align(StackTmp_); auto tmpc_ptr = reinterpret_cast((char*)StackTmp + TmpSize); - utils::GemvParamB paramB = SNbits::template 
createB(_param.paramB.packedW); + utils::GemvParamB paramB = gemv_nbits::NBitsHelper::template createB(_param.paramB.packedW); utils::GemvParamA paramA{ _param.paramA.quan->template APtr(), _param.paramA.quan->template SPtr(), _param.paramA.quan->template ZPtr(), _param.paramA.quan->mKPad, _param.paramA.quan->CStep()}; @@ -705,7 +656,7 @@ class LauncherIntKBlock { int n = _param.problem.dims[2]; int k = _param.problem.dims[3]; int kblocksize = _param.problem.dims[4]; - SNbits::template updateBNStep(paramB, _config.loc[1]); + gemv_nbits::NBitsHelper::template updateBNStep(paramB, _config.loc[1]); int size_padded = utils::padto_le(_config.size[1], GemmCore::NTILE); int in = 0; for (; in < size_padded; in += GemmCore::NTILE) { @@ -716,9 +667,9 @@ class LauncherIntKBlock { kernel::wrapper::GEMVWoqNBits::forward_s8s8_fp32<_RT_ISA_T, ScaleT, GemmCore::NTILE, MTILE>( paramA, paramB, tmpc_ptr, GemmCore::NTILE, k, kblocksize, StackTmp, TmpSize); } - Epilogue::forward(tmpc_ptr, GemmCore::NTILE, 0, _config.loc[1] + in, MTILE, GemmCore::NTILE, _param.paramC, - StackTmp, TmpSize); - SNbits::template updateBNStep(paramB, GemmCore::NTILE); + Epilogue::template forward(tmpc_ptr, GemmCore::NTILE, 0, _config.loc[1] + in, MTILE, GemmCore::NTILE, + _param.paramC, StackTmp, TmpSize); + gemv_nbits::NBitsHelper::template updateBNStep(paramB, GemmCore::NTILE); } if (size_padded != _config.size[1]) { if constexpr (std::is_same_v) { @@ -728,8 +679,8 @@ class LauncherIntKBlock { kernel::wrapper::GEMVWoqNBits::forward_s8s8_fp32<_RT_ISA_T, ScaleT, GemmCore::NTILE, MTILE>( paramA, paramB, tmpc_ptr, GemmCore::NTILE, k, kblocksize, StackTmp, TmpSize); } - Epilogue::forward(tmpc_ptr, GemmCore::NTILE, 0, _config.loc[1] + in, MTILE, (_config.size[1] - in), - _param.paramC, StackTmp, TmpSize); + Epilogue::template forward(tmpc_ptr, GemmCore::NTILE, 0, _config.loc[1] + in, MTILE, + (_config.size[1] - in), _param.paramC, StackTmp, TmpSize); } } } @@ -737,190 +688,28 @@ class LauncherIntKBlock { static 
void gemv(const Param& _param, const parallel::gemm::ThreadProblemBase& _config) { if constexpr (support()) { auto& m = _param.problem.dims[1]; - if (_param.paramB.packedW->mDType == BTLA_DTYPE::S4_CLIP) { - if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::BF16) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } + if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { + if (m == 1) gemv_kblock(_param, _config); + if (m == 2) gemv_kblock(_param, _config); + if (m == 3) gemv_kblock(_param, _config); + if (m == 4) gemv_kblock(_param, _config); + if constexpr (Reg32) { + if (m == 5) gemv_kblock(_param, _config); + if (m == 6) gemv_kblock(_param, _config); + if (m == 7) gemv_kblock(_param, _config); + if (m == 8) gemv_kblock(_param, _config); } - return; - } - if (_param.paramB.packedW->mDType == BTLA_DTYPE::S5_CLIP) { - if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, 
_config); - if (m == 8) gemv_kblock(_param, _config); - } - - } else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::BF16) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } - return; - } - if (_param.paramB.packedW->mDType == BTLA_DTYPE::S6_CLIP) { - if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - - } else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::BF16) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } - return; - } - if (_param.paramB.packedW->mDType == BTLA_DTYPE::S7_CLIP) { - if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - 
} - - } else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::BF16) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } - return; - } - if (_param.paramB.packedW->mDType == BTLA_DTYPE::S3_CLIP) { - if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::BF16) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } - return; - } - if (_param.paramB.packedW->mDType == BTLA_DTYPE::S1_CLIP) { - if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } else if (_param.paramB.packedW->SDtype() == 
BTLA_DTYPE::BF16) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } - return; - } - if (_param.paramB.packedW->mDType == BTLA_DTYPE::S2_CLIP) { - if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F32) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } - } else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::BF16) { - if (m == 1) gemv_kblock(_param, _config); - if (m == 2) gemv_kblock(_param, _config); - if (m == 3) gemv_kblock(_param, _config); - if (m == 4) gemv_kblock(_param, _config); - if constexpr (Reg32) { - if (m == 5) gemv_kblock(_param, _config); - if (m == 6) gemv_kblock(_param, _config); - if (m == 7) gemv_kblock(_param, _config); - if (m == 8) gemv_kblock(_param, _config); - } + } else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::BF16) { + if (m == 1) gemv_kblock(_param, _config); + if (m == 2) gemv_kblock(_param, _config); + if (m == 3) gemv_kblock(_param, _config); + if (m == 4) gemv_kblock(_param, _config); + if constexpr (Reg32) { + if (m == 5) gemv_kblock(_param, _config); + if (m == 6) gemv_kblock(_param, _config); + if (m == 7) gemv_kblock(_param, _config); + if (m == 8) gemv_kblock(_param, _config); } - return; } } } @@ -1023,8 +812,8 @@ class LauncherIntKBlock { bcache_stride, ccache_stride, iterk, 1.f, tmp_, _config.tmpcachesize); } } - mEpilogue.forward(tmpC, _config.block[1], 
(_config.loc[0] + blk_m), _config.loc[1] + blk_n, blk_msize, blk_nsize, - _param.paramC, tmpcache, _config.tmpcachesize); + Epilogue::template forward(tmpC, _config.block[1], (_config.loc[0] + blk_m), _config.loc[1] + blk_n, blk_msize, + blk_nsize, _param.paramC, tmpcache, _config.tmpcachesize); } // _config.block[2](tmpC, _config.block[1], (_config.loc[0] + blk_m), _config.loc[1] + blk_n, blk_msize, + blk_nsize, _param.paramC, tmpcache, _config.tmpcachesize); } }; } // namespace gemm diff --git a/bestla/bestla/kernel_avx2.h b/bestla/bestla/kernel_avx2.h index 361577d73..29110cf0d 100644 --- a/bestla/bestla/kernel_avx2.h +++ b/bestla/bestla/kernel_avx2.h @@ -6634,10 +6634,57 @@ static inline BTLA_CODE gemv_7bit_s8s8_fp32(const utils::GemvParamA& A, const ut #endif } // namespace vnni +template +static inline BTLA_CODE mul(const T* src0ptr, const T* src1ptr, T* dstptr, size_t size) { + int constexpr VLen = 8; + size_t velt = utils::padto_le(size, VLen); + size_t i = 0; + auto vfunc = [&]() { + auto v0 = load_T_fp32(src0ptr + i); + auto v1 = load_T_fp32(src1ptr + i); + auto out = _mm256_mul_ps(v0, v1); + store_fp_T(out, dstptr + i); + }; + for (; i < velt; i += VLen) vfunc(); + if (i < size) { + if (size >= VLen) { + i = size - VLen; + vfunc(); + } else { + ref::mul(src0ptr + i, src1ptr + i, dstptr + i, size - i); + } + } + return BTLA_CODE::Success; +} + +template +static inline BTLA_CODE add(const T* src0ptr, const T* src1ptr, T* dstptr, size_t size) { + int constexpr VLen = 8; + size_t velt = utils::padto_le(size, VLen); + size_t i = 0; + auto vfunc = [&]() { + auto v0 = load_T_fp32(src0ptr + i); + auto v1 = load_T_fp32(src1ptr + i); + auto out = _mm256_add_ps(v0, v1); + store_fp_T(out, dstptr + i); + }; + for (; i < velt; i += VLen) vfunc(); + if (i < size) { + if (size >= VLen) { + i = size - VLen; + vfunc(); + } else { + ref::add(src0ptr + i, src1ptr + i, dstptr + i, size - i); + } + } + return BTLA_CODE::Success; +} + #ifdef __GNUC__ #pragma GCC pop_options 
#else #endif + #endif } // namespace avx2 } // namespace kernel diff --git a/bestla/bestla/kernel_avx512f.h b/bestla/bestla/kernel_avx512f.h index 590024f93..eef6c96fc 100644 --- a/bestla/bestla/kernel_avx512f.h +++ b/bestla/bestla/kernel_avx512f.h @@ -7784,6 +7784,51 @@ static inline BTLA_CODE gemv_7bit_s8s8_fp32(const utils::GemvParamA& A, const ut #endif } // namespace vnni +template +static inline BTLA_CODE mul(const T* src0ptr, const T* src1ptr, T* dstptr, size_t size) { + int constexpr VLen = 16; + size_t velt = utils::padto_le(size, VLen); + size_t i = 0; + auto vfunc = [&]() { + auto v0 = load_T_fp32(src0ptr + i); + auto v1 = load_T_fp32(src1ptr + i); + auto out = _mm512_mul_ps(v0, v1); + store_fp_T(out, dstptr + i); + }; + for (; i < velt; i += VLen) vfunc(); + if (i < size) { + if (size >= VLen) { + i = size - VLen; + vfunc(); + } else { + ref::mul(src0ptr + i, src1ptr + i, dstptr + i, size - i); + } + } + return BTLA_CODE::Success; +} + +template +static inline BTLA_CODE add(const T* src0ptr, const T* src1ptr, T* dstptr, size_t size) { + int constexpr VLen = 16; + size_t velt = utils::padto_le(size, VLen); + size_t i = 0; + auto vfunc = [&]() { + auto v0 = load_T_fp32(src0ptr + i); + auto v1 = load_T_fp32(src1ptr + i); + auto out = _mm512_add_ps(v0, v1); + store_fp_T(out, dstptr + i); + }; + for (; i < velt; i += VLen) vfunc(); + if (i < size) { + if (size >= VLen) { + i = size - VLen; + vfunc(); + } else { + ref::add(src0ptr + i, src1ptr + i, dstptr + i, size - i); + } + } + return BTLA_CODE::Success; +} #ifdef __GNUC__ #pragma GCC pop_options #else diff --git a/bestla/bestla/kernel_ref.h b/bestla/bestla/kernel_ref.h index 43373c6c3..eb04a2d8d 100644 --- a/bestla/bestla/kernel_ref.h +++ b/bestla/bestla/kernel_ref.h @@ -3392,6 +3392,23 @@ static inline BTLA_CODE gemv_7bit_s8s8_fp32(const utils::GemvParamA& A, const ut return BTLA_CODE::Success; } +template +static inline BTLA_CODE mul(const T* src0ptr, const T* src1ptr, T* dstptr, size_t size) { + for 
(size_t i = 0; i < size; i++) { + float tmp = float(src0ptr[i]) * float(src1ptr[i]); + dstptr[i] = tmp; + } + return BTLA_CODE::Success; +} + +template +static inline BTLA_CODE add(const T* src0ptr, const T* src1ptr, T* dstptr, size_t size) { + for (size_t i = 0; i < size; i++) { + float tmp = float(src0ptr[i]) + float(src1ptr[i]); + dstptr[i] = tmp; + } + return BTLA_CODE::Success; +} } // namespace ref } // namespace kernel } // namespace bestla diff --git a/bestla/bestla/kernel_wrapper.h b/bestla/bestla/kernel_wrapper.h index 3b65e6cb5..4491e6515 100644 --- a/bestla/bestla/kernel_wrapper.h +++ b/bestla/bestla/kernel_wrapper.h @@ -1558,6 +1558,66 @@ class GEMVWoqNBits { } }; +template +class Mul { + public: + template + static inline BTLA_CODE forward(const T* src0ptr, const T* src1ptr, T* dstptr, size_t size) { +#if CompileAVX512F() + if constexpr (utils::isa_base::avx512f) { + return avx512f::mul(src0ptr, src1ptr, dstptr, size); + } +#endif +#if CompileAVX2() + if constexpr (utils::isa_base::avx2) { + return avx2::mul(src0ptr, src1ptr, dstptr, size); + } +#endif + return ref::mul(src0ptr, src1ptr, dstptr, size); + } + + static inline BTLA_CODE forward_auto(const T* src0ptr, const T* src1ptr, T* dstptr, size_t size) { + GetCPUDevice(); + if (_cd->AVX512F()) { + return forward(src0ptr, src1ptr, dstptr, size); + } + if (_cd->AVX2()) { + return forward(src0ptr, src1ptr, dstptr, size); + } + return forward(src0ptr, src1ptr, dstptr, size); + } +}; + +template +class Add { + public: + template + static inline BTLA_CODE forward(const T* src0ptr, const T* src1ptr, T* dstptr, size_t size) { +#if CompileAVX512F() + if constexpr (utils::isa_base::avx512f) { + return avx512f::add(src0ptr, src1ptr, dstptr, size); + } +#endif +#if CompileAVX2() + if constexpr (utils::isa_base::avx2) { + return avx2::add(src0ptr, src1ptr, dstptr, size); + } +#endif + return ref::add(src0ptr, src1ptr, dstptr, size); + } + + static inline BTLA_CODE forward_auto(const T* src0ptr, const T* 
src1ptr, T* dstptr, size_t size) { + GetCPUDevice(); + if (_cd->AVX512F()) { + return forward(src0ptr, src1ptr, dstptr, size); + } + if (_cd->AVX2()) { + return forward(src0ptr, src1ptr, dstptr, size); + } + return forward(src0ptr, src1ptr, dstptr, size); + } +}; + } // namespace wrapper } // namespace kernel } // namespace bestla diff --git a/bestla/bestla/ut/bestla_benchmark.cpp b/bestla/bestla/ut/bestla_benchmark.cpp index 24a952dbe..8d62b1fac 100644 --- a/bestla/bestla/ut/bestla_benchmark.cpp +++ b/bestla/bestla/ut/bestla_benchmark.cpp @@ -1,8 +1,7 @@ #include #include "bestla_wrapper.h" #include "bestla_ut.h" -#undef BTLA_UT_WRAPPER -#undef BTLA_UT_PROLOGUE_B + namespace bestla { using namespace utils; namespace ut { @@ -13,7 +12,6 @@ class Benchmark_Fp32Fp32 { UT_START(); benchmark_all(1, 4096, 4096); benchmark_all(1024, 4096, 4096); - benchmark_all(2048, 4096, 4096); } using AType = float; @@ -78,10 +76,10 @@ class Benchmark_Fp32Fp32 { auto threads_cfg = UT_Threading::get_threads_config(); for (auto threads : threads_cfg) { if (_cd->AVX512F()) { - benchmark(m, n, k, batch, A.data(), B.data(), C.data(), testtime, threads); + benchmark, LOG>(m, n, k, batch, A.data(), B.data(), C.data(), testtime, threads); } if (_cd->AVX2()) { - benchmark(m, n, k, batch, A.data(), B.data(), C.data(), testtime, threads); + benchmark, LOG>(m, n, k, batch, A.data(), B.data(), C.data(), testtime, threads); } } } @@ -96,7 +94,6 @@ class Benchmark_U8S8S32 { UT_START(); benchmark_all(1, 4096, 4096); benchmark_all(1024, 4096, 4096); - benchmark_all(2048, 4096, 4096); } using AType = uint8_t; @@ -166,8 +163,6 @@ class Benchmark_U8S8S32 { benchmark, LOG>(m, n, k, batch, A.data(), B.data(), C.data(), testtime, threads); } if (_cd->AVX512_VNNI()) { - benchmark, LOG>(m, n, k, batch, A.data(), B.data(), C.data(), testtime, - threads); benchmark, LOG>(m, n, k, batch, A.data(), B.data(), C.data(), testtime, threads); } @@ -177,20 +172,22 @@ class Benchmark_U8S8S32 { if (_cd->AVX_VNNI()) { 
benchmark, LOG>(m, n, k, batch, A.data(), B.data(), C.data(), testtime, threads); } + if (_cd->AVX2()) { + benchmark, LOG>(m, n, k, batch, A.data(), B.data(), C.data(), testtime, threads); + } } } }; #ifdef BTLA_UT_WRAPPER -#endif static Benchmark_U8S8S32 sBenchmark_U8S8S32; +#endif class Benchmark_S8S8S32 { public: Benchmark_S8S8S32() { UT_START(); - // benchmark_all(1, 4096, 4096); + benchmark_all(1, 4096, 4096); benchmark_all(1024, 4096, 4096); - // benchmark_all(2048, 4096, 4096); } using AType = int8_t; @@ -254,10 +251,14 @@ class Benchmark_S8S8S32 { GetCPUDevice(); auto threads_cfg = UT_Threading::get_threads_config(); for (auto threads : threads_cfg) { + if (_cd->AVX2()) { + benchmark, LOG>(m, n, k, batch, A.data(), B.data(), C.data(), testtime, + threads); + } if (_cd->AVX_VNNI()) { benchmark, LOG>(m, n, k, batch, A.data(), B.data(), C.data(), testtime, threads); - benchmark, LOG>(m, n, k, batch, A.data(), B.data(), C.data(), testtime, + benchmark, LOG>(m, n, k, batch, A.data(), B.data(), C.data(), testtime, threads); } if (_cd->AMX_INT8()) { @@ -281,7 +282,6 @@ class Benchmark_Bf16Bf16Fp32 { UT_START(); benchmark_all(1, 4096, 4096); benchmark_all(1024, 4096, 4096); - benchmark_all(2048, 4096, 4096); } using AType = utils::bf16; @@ -363,7 +363,6 @@ class Benchmark_Fp16Fp16Fp16 { UT_START(); benchmark_all(1, 4096, 4096); benchmark_all(1024, 4096, 4096); - benchmark_all(2048, 4096, 4096); } using AType = utils::fp16; @@ -443,57 +442,42 @@ class UTWOQ_CompFp32 { public: UTWOQ_CompFp32() { UT_START(); - ut_s1(); - ut_s7(); - ut_s6(); - /*ut_s5(); - ut_s2(); - ut_s4(); - ut_s3();*/ - // ut_s8(); - // ut_f4(); - } - void ut_s1() { - benchmark_all(1, 4096, 4096, BTLA_DTYPE::S1_CLIP); - benchmark_all(1024, 4096, 4096, BTLA_DTYPE::S1_CLIP); - } - void ut_s2() { - benchmark_all(1, 4096, 4096, BTLA_DTYPE::S2_CLIP); - benchmark_all(1024, 4096, 4096, BTLA_DTYPE::S2_CLIP); - } - void ut_s3() { - benchmark_all(1, 4096, 4096, BTLA_DTYPE::S3_CLIP); - benchmark_all(1024, 
4096, 4096, BTLA_DTYPE::S3_CLIP); - } - void ut_s4() { - benchmark_all(1, 4096, 4096, BTLA_DTYPE::S4_CLIP); - benchmark_all(1024, 4096, 4096, BTLA_DTYPE::S4_CLIP); - } - void ut_s5() { - benchmark_all(1, 4096, 4096, BTLA_DTYPE::S5_CLIP); - benchmark_all(1024, 4096, 4096, BTLA_DTYPE::S5_CLIP); - } - void ut_s6() { - benchmark_all(1, 4096, 4096, BTLA_DTYPE::S6_CLIP); - benchmark_all(1024, 4096, 4096, BTLA_DTYPE::S6_CLIP); + ut_s4_full(); + ut_new_dtype(BTLA_DTYPE::S1_CLIP); + ut_new_dtype(BTLA_DTYPE::S2_CLIP); + ut_new_dtype(BTLA_DTYPE::S3_CLIP); + ut_new_dtype(BTLA_DTYPE::S5_CLIP); + ut_new_dtype(BTLA_DTYPE::S6_CLIP); + ut_new_dtype(BTLA_DTYPE::S7_CLIP); + ut_new_dtype(BTLA_DTYPE::S8); + ut_f4(); } - void ut_s7() { - benchmark_all(1, 4096, 4096, BTLA_DTYPE::S7_CLIP); - benchmark_all(1024, 4096, 4096, BTLA_DTYPE::S7_CLIP); + + void ut_new_dtype(BTLA_DTYPE qtype) { + benchmark_all(1, 4096, 4096, qtype, true); + benchmark_all(1, 4096, 4096, qtype); + benchmark_all(1, 4096, 4096, qtype, true); } - void ut_s8() { - benchmark_all(1, 4096, 4096, BTLA_DTYPE::S8); - benchmark_all(1024, 4096, 4096, BTLA_DTYPE::S8); + + void ut_s4_full() { + BTLA_DTYPE qtype = BTLA_DTYPE::S4_CLIP; + benchmark_all(1, 4096, 4096, qtype, true); + benchmark_all(1, 4096, 4096, qtype); + benchmark_all(1, 4096, 4096, qtype, true); + benchmark_all(1, 4096, 4096, qtype); + benchmark_all(1024, 4096, 4096, qtype); } void ut_f4() { benchmark_all(1, 4096, 4096, BTLA_DTYPE::F4_BNB); + benchmark_all(1, 4096, 4096, BTLA_DTYPE::F4_E2M1); + benchmark_all(1, 4096, 4096, BTLA_DTYPE::F4_NF4); benchmark_all(1024, 4096, 4096, BTLA_DTYPE::F4_BNB); } template class Wei, typename Scale_T> void benchmark(int m, int n, int k, int batch, int blocksize, float* A, float* B, float* C, float timems, int threads, - BTLA_DTYPE qtype) { + BTLA_DTYPE qtype, bool isasym) { LOG_T log; using Parallel = parallel::gemm::SchedulerBase; using Launcher = wrapper::gemm::LauncherBase, prologue_b::gemm::WeightKBlockNInteger>) { - tmpB = 
kernel.mProB.createStorage(n, k, blocksize, qtype, bestla_dtype, bestla_dtype, false); + tmpB = kernel.mProB.createStorage(n, k, blocksize, qtype, bestla_dtype, bestla_dtype, isasym); } else if constexpr (std::is_same_v, prologue_b::gemm::WeightKBlockNFloat>) { @@ -524,7 +508,9 @@ class UTWOQ_CompFp32 { memcpy(packBs[i].template SPtr(), packBs[0].template SPtr(), packBs[0].CSize() * sizeof(Scale_T)); } auto psize = (size_t)m * n * k * 2; - auto memsize = (size_t)packBs[0].mSize + (m * k + m * n) * sizeof(float); + int blks = k / blocksize; + int nbits = utils::bestla_dtype_bits(qtype); + auto memsize = (size_t)(n * k * nbits / 8 + n * blks * sizeof(Scale_T)) + (m * k + m * n) * sizeof(float); tm.start(); while (tm.stop() < timems) { for (int i = 0; i < batch; i++) { @@ -541,16 +527,17 @@ class UTWOQ_CompFp32 { log.record(); double flops = double(psize) / log.min_val / 1e6; double band = double(memsize) / log.min_val / 1e6; + int cores = std::min(threads, device::CpuDevice::getInstance()->getCores()); printf("Threads %d Block %d %s %s Flops:%.3fG PerCoreFlops:%.3fG MemoryBandwidth:%.3fGB/s\n", threads, blocksize, - corestr, log.get_log_str(), flops, flops / threads, band); + corestr, log.get_log_str(), flops, flops / cores, band); } template