From 2b5f45a59b0ee2031833394474d335d27d9e1ee0 Mon Sep 17 00:00:00 2001 From: luoyu-intel Date: Wed, 22 May 2024 15:09:01 +0800 Subject: [PATCH] update doc and clang-format --- README.md | 2 +- bestla/README.md | 8 +++++++- bestla/bestla/kernel_avx512f.h | 18 +++++++++--------- docs/advanced_usage.md | 7 ++++--- neural_speed/core/README.md | 6 +++++- 5 files changed, 26 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 66abc6afc..bdc8afc88 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Neural Speed is an innovative library designed to support the efficient inference of large language models (LLMs) on Intel platforms through the state-of-the-art (SOTA) low-bit quantization powered by [Intel Neural Compressor](https://github.com/intel/neural-compressor). The work is inspired by [llama.cpp](https://github.com/ggerganov/llama.cpp) and further optimized for Intel platforms with our innovations in [NeurIPS' 2023](https://arxiv.org/abs/2311.00502) ## Key Features -- Highly optimized low-precision kernels on CPUs with ISAs (AMX, VNNI, AVX512F, AVX_VNNI and AVX2). See [details](neural_speed/core/README.md) +- Highly optimized kernels on CPUs with ISAs (AMX, VNNI, AVX512F, AVX_VNNI and AVX2) for N-bit weights (int1, int2, int3, int4, int5, int6, int7 and int8). See [details](neural_speed/core/README.md) - Up to 40x performance speedup on popular LLMs compared with llama.cpp. See [details](https://medium.com/@NeuralCompressor/llm-performance-of-intel-extension-for-transformers-f7d061556176) - Tensor parallelism across sockets/nodes on CPUs. See [details](./docs/tensor_parallelism.md) diff --git a/bestla/README.md b/bestla/README.md index ca1229ccb..cb4529bb7 100644 --- a/bestla/README.md +++ b/bestla/README.md @@ -28,6 +28,7 @@ BesTLA provides weight-only linear computational capabilities for LLM inference. | INT5 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym | | INT6 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym | | INT7 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym | +| INT1 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym | | FP8 (E4M3, E5M2) | BF16 / FP32 | FP32 / FP8 (E8M0) | sym | | FP4 (E2M1) | BF16 / FP32 | BF16 / FP32 | sym | | NF4 | BF16 / FP32 | BF16 / FP32 | sym | @@ -49,6 +50,11 @@ BesTLA provides assembly-level postop-fusion through epilogue to minimize the ov - RELU - EXP - TANH + +## Optimized thread pool for hybrid CPUs +Our thread pool is optimized for hybrid CPUs (client CPUs newer than 11th Gen Core). It is much faster than an OpenMP thread pool. +We recommend using all hardware threads on a hybrid CPU: P cores * 2 + E cores.
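As a quick illustration of that recommendation, the sketch below (Python; `recommended_threads` is a hypothetical helper, not part of the BesTLA or Neural Speed API) computes the suggested thread count from core counts you supply yourself:

```python
def recommended_threads(p_cores: int, e_cores: int) -> int:
    """Suggested thread count on a hybrid client CPU: P cores * 2 + E cores."""
    return p_cores * 2 + e_cores

# Example: a CPU with 8 P-cores and 8 E-cores -> 8 * 2 + 8 = 24 threads.
print(recommended_threads(8, 8))
```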
+ ## Compilation Requirements and Usage Compile: @@ -67,7 +73,7 @@ Best Performance: Usage: ```cmake add_subdirectory(bestla) -target_link_libraries("${YOUR_PROJECT}" bestla::bestla) +target_link_libraries("${YOUR_PROJECT}" neural_speed::bestla) ``` # Benchmark diff --git a/bestla/bestla/kernel_avx512f.h b/bestla/bestla/kernel_avx512f.h index 268c5fd26..32dc849ea 100644 --- a/bestla/bestla/kernel_avx512f.h +++ b/bestla/bestla/kernel_avx512f.h @@ -2594,7 +2594,7 @@ template inline BTLA_CODE decompress_kblock_s4_s8(utils::int4x2* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::int4x2* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, + typedef BTLA_CODE (*decompfunc)(utils::int4x2 * srcptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { @@ -2816,7 +2816,7 @@ static inline BTLA_CODE decompress_kblock_s2_s8(utils::bit2x4* bit2ptr, int8_t* int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit2x4* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, + typedef BTLA_CODE (*decompfunc)(utils::bit2x4 * srcptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { @@ -3074,7 +3074,7 @@ static inline BTLA_CODE decompress_kblock_s3_s8(utils::bit2x4* bit2ptr, utils::b int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit2x4* bit2ptr, utils::bit1x8* bit1ptr, int8_t* zpptr, int8_t* dstptr, + typedef BTLA_CODE (*decompfunc)(utils::bit2x4 * bit2ptr, utils::bit1x8 * bit1ptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; @@ -3296,7 +3296,7 @@ static inline BTLA_CODE decompress_kblock_s1_s8(utils::bit1x8* bit1ptr, int8_t* int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit1x8* bit1ptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, + typedef BTLA_CODE (*decompfunc)(utils::bit1x8 * bit1ptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { @@ -3545,7 +3545,7 @@ static inline BTLA_CODE decompress_kblock_s5_s8(utils::bit4x2* bit4ptr, utils::b int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit4x2* bit4ptr, utils::bit1x8* bit1ptr, int8_t* zpptr, int8_t* dstptr, + typedef BTLA_CODE (*decompfunc)(utils::bit4x2 * bit4ptr, utils::bit1x8 * bit1ptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; @@ -3859,9 +3859,9 @@ static inline BTLA_CODE decompress_kblock_s7_s8(utils::bit4x2* bit4ptr, utils::b int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef 
BTLA_CODE (*decompfunc)(utils::bit4x2* bit4ptr, utils::bit2x4* bit2ptr, utils::bit1x8* bit1ptr, - int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, - int row, int8_t* tmp, size_t tmpsize); + typedef BTLA_CODE (*decompfunc)(utils::bit4x2 * bit4ptr, utils::bit2x4 * bit2ptr, utils::bit1x8 * bit1ptr, + int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, + int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { if constexpr (PackRow == 1) { @@ -4130,7 +4130,7 @@ static inline BTLA_CODE decompress_kblock_s6_s8(utils::bit4x2* bit4ptr, utils::b int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit4x2* bit4ptr, utils::bit2x4* bit2ptr, int8_t* zpptr, int8_t* dstptr, + typedef BTLA_CODE (*decompfunc)(utils::bit4x2 * bit4ptr, utils::bit2x4 * bit2ptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md index e31e32468..f1cf65fc0 100644 --- a/docs/advanced_usage.md +++ b/docs/advanced_usage.md @@ -6,7 +6,7 @@ Argument description of run.py ([supported MatMul combinations](#supported-matri | Argument | Description | | -------------- | --------------------------------------------------------------------- | | model | Directory containing model file or model id: String | -| --weight_dtype | Data type of quantized weight: int4/int3/int2/int5/int6/int7/int8/fp8(=fp8_e4m3)/fp8_e5m2/fp4(=fp4e2m1)/nf4 (default int4) | +| --weight_dtype | Data type of quantized weight: int4/int3/int2/int5/int6/int7/int1/int8/fp8(=fp8_e4m3)/fp8_e5m2/fp4(=fp4e2m1)/nf4 (default int4) | | --alg | Quantization algorithm: sym/asym (default sym) | | --group_size | Group size: Int, 16/32/64/128/-1 (per channel) (default: 32) | | --scale_dtype | Data type of scales: fp32/bf16/fp8 (default fp32) | @@ -60,7 +60,7 @@ Argument description of quantize.py ([supported MatMul combinations](#supported- | --build_dir | Path to the build file: String | | --config | Path to the configuration file: String (default: "") | | --nthread | Number of threads to use: Int (default: 1) | -| --weight_dtype | Data type of quantized weight: int4/int3/int2/int5/int6/int7/int8/fp8(=fp8_e4m3)/fp8_e5m2/fp4(=fp4_e2m1)/nf4 (default: int4) | +| --weight_dtype | Data type of quantized weight: int4/int3/int2/int5/int6/int7/int1/int8/fp8(=fp8_e4m3)/fp8_e5m2/fp4(=fp4_e2m1)/nf4 (default: int4) | | --alg | Quantization algorithm to use: sym/asym (default: sym) | | --group_size | Group size: Int 16/32/64/128/-1 (per channel) (default: 32) | | --scale_dtype | Data type of scales: bf16/fp32/fp8 (default: fp32) | @@ -69,7 +69,7 @@ Argument description of quantize.py ([supported MatMul combinations](#supported- #### Supported Matrix Multiplication Data Types Combinations -Our Neural Speed supports INT4 / INT3 / INT2 / INT5 / INT6 / INT7 / INT8 / FP8 (E4M3, E5M2) / FP4 (E2M1) / NF4 weight-only quantization and FP32 / FP16 / BF16 / INT8 computation forward matmul on the Intel platforms. Here are the all supported data types combinations for matmul operations (quantization and forward). +Our Neural Speed supports INT4 / INT3 / INT2 / INT5 / INT6 / INT7 / INT1 / INT8 / FP8 (E4M3, E5M2) / FP4 (E2M1) / NF4 weight-only quantization and FP32 / FP16 / BF16 / INT8 computation forward matmul on the Intel platforms. 
Here are all the supported data type combinations for matmul operations (quantization and forward). > This table will be updated frequently due to active development. For details you can refer to [BesTLA](../bestla#weight-only) | Weight dtype | Compute dtype (default value) | Scale dtype (default value) | Quantization scheme (default value) | @@ -82,6 +82,7 @@ Our Neural Speed supports INT4 / INT3 / INT2 / INT5 / INT6 / INT7 / INT8 / FP8 | INT5 | INT8 / BF16 / FP16 / FP32 (FP32) | BF16 / FP32 (FP32) | sym / asym (sym) | | INT6 | INT8 / BF16 / FP16 / FP32 (FP32) | BF16 / FP32 (FP32) | sym / asym (sym) | | INT7 | INT8 / BF16 / FP16 / FP32 (FP32) | BF16 / FP32 (FP32) | sym / asym (sym) | +| INT1 | INT8 / BF16 / FP16 / FP32 (FP32) | BF16 / FP32 (FP32) | sym / asym (sym) | | FP8 (E4M3, E5M2) | BF16 / FP16 / FP32 (FP32) | FP8 (FP8) | sym (sym) | | FP4 (E2M1) | BF16 / FP16 / FP32 (FP32) | BF16 / FP32 (FP32) | sym (sym) | | NF4 | BF16 / FP16 / FP32 (FP32) | BF16 / FP32 (FP32) | sym (sym) | diff --git a/neural_speed/core/README.md b/neural_speed/core/README.md index 091fbdddd..ee4673e59 100644 --- a/neural_speed/core/README.md +++ b/neural_speed/core/README.md @@ -27,6 +27,7 @@ int2 | symmetric or asymmetric | multiplier of 8, -11 int5 | symmetric or asymmetric | multiplier of 8, -11 int6 | symmetric or asymmetric | multiplier of 8, -11 int7 | symmetric or asymmetric2 | multiplier of 8, -11 +int1 | symmetric or asymmetric | multiplier of 8, -11 int8 | symmetric | multiplier of 8, -11 fp4 | | multiplier of 8 nf4 | | multiplier of 8 @@ -84,7 +85,10 @@ Skylake | sym int3<br>group size=128<br>compute type=fp32 | AVX512F
Alder Lake (12th Gen)<br>Raptor Lake (13th and 14th Gen)| sym int3<br>group size=128<br>compute type=int8 | AVX_VNNI
Older architecture (before 12th Gen)| sym int3<br>group size=128<br>compute type=int8 | AVX2

+`sym int5 group=-1 comp_dtype=int8` is the fastest configuration for first-token performance with good accuracy.
+`sym int3 group=128 comp_dtype=int8` is the fastest configuration for next-token performance with good accuracy.
+
NOTE:
-1. group_size=-1 requires the INC's finetuned model, or it may have lower accuracy than small group sizes. It has the smallest model size, and the fastest first-token performance.
+1. group_size=-1 has the smallest model size and the fastest first-token performance, but it requires an INC-finetuned model, or it may have lower accuracy than small group sizes.
2. group_size=128 is a balance of accuracy and speed if you want RTN quantization only.
3. group_size=32, scale_dtype=bf16, compute_dtype=int8, alg=sym equals llama.cpp's Q4_0.
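To keep the recommended settings in one place, here is a minimal sketch in Python; `pick_weight_config` is an illustrative helper rather than a Neural Speed API, and the dictionary keys simply mirror the parameter names used in this section:

```python
def pick_weight_config(optimize_for: str) -> dict:
    """Return the weight-only quantization settings recommended in this section.

    Illustrative helper only (not a Neural Speed API); pass the returned values
    to whichever quantization entry point you use.
    """
    if optimize_for == "first_token":
        # Fastest first-token configuration with good accuracy.
        return {"weight_dtype": "int5", "alg": "sym", "group_size": -1, "compute_dtype": "int8"}
    if optimize_for == "next_token":
        # Fastest next-token configuration with good accuracy.
        return {"weight_dtype": "int3", "alg": "sym", "group_size": 128, "compute_dtype": "int8"}
    # Default: NOTE 3, the combination equivalent to llama.cpp's Q4_0.
    return {"weight_dtype": "int4", "alg": "sym", "group_size": 32,
            "scale_dtype": "bf16", "compute_dtype": "int8"}


print(pick_weight_config("first_token"))
```

As NOTE 1 explains, the group_size=-1 choices trade accuracy for speed unless an INC-finetuned model is used.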