From 2b5f45a59b0ee2031833394474d335d27d9e1ee0 Mon Sep 17 00:00:00 2001 From: luoyu-intel Date: Wed, 22 May 2024 15:09:01 +0800 Subject: [PATCH] update doc and clang-format --- README.md | 2 +- bestla/README.md | 8 +++++++- bestla/bestla/kernel_avx512f.h | 18 +++++++++--------- docs/advanced_usage.md | 7 ++++--- neural_speed/core/README.md | 6 +++++- 5 files changed, 26 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 66abc6afc..bdc8afc88 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Neural Speed is an innovative library designed to support the efficient inference of large language models (LLMs) on Intel platforms through the state-of-the-art (SOTA) low-bit quantization powered by [Intel Neural Compressor](https://github.com/intel/neural-compressor). The work is inspired by [llama.cpp](https://github.com/ggerganov/llama.cpp) and further optimized for Intel platforms with our innovations in [NeurIPS' 2023](https://arxiv.org/abs/2311.00502) ## Key Features -- Highly optimized low-precision kernels on CPUs with ISAs (AMX, VNNI, AVX512F, AVX_VNNI and AVX2). See [details](neural_speed/core/README.md) +- Highly optimized kernels on CPUs with ISAs (AMX, VNNI, AVX512F, AVX_VNNI and AVX2) for N-bit weights (int1, int2, int3, int4, int5, int6, int7 and int8). See [details](neural_speed/core/README.md) - Up to 40x performance speedup on popular LLMs compared with llama.cpp. See [details](https://medium.com/@NeuralCompressor/llm-performance-of-intel-extension-for-transformers-f7d061556176) - Tensor parallelism across sockets/nodes on CPUs. See [details](./docs/tensor_parallelism.md) diff --git a/bestla/README.md b/bestla/README.md index ca1229ccb..cb4529bb7 100644 --- a/bestla/README.md +++ b/bestla/README.md @@ -28,6 +28,7 @@ BesTLA provides weight-only linear computational capabilities for LLM inference. | INT5 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym | | INT6 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym | | INT7 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym | +| INT1 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym | | FP8 (E4M3, E5M2) | BF16 / FP32 | FP32 / FP8 (E8M0) | sym | | FP4 (E2M1) | BF16 / FP32 | BF16 / FP32 | sym | | NF4 | BF16 / FP32 | BF16 / FP32 | sym | @@ -49,6 +50,11 @@ BesTLA provides assembly-level postop-fusion through epilogue to minimize the ov - RELU - EXP - TANH + +## Optimized thread pool for hybrid CPUs +Our thread pool is optimized for hybrid CPUs (client CPUs newer than 11th Gen Core). It is much faster than an OpenMP thread pool. +We recommend using all hardware threads on a hybrid CPU: P cores * 2 + E cores.
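As a quick illustration of that recommendation, the sketch below (Python; `recommended_threads` is a hypothetical helper, not part of the BesTLA or Neural Speed API) computes the suggested thread count from core counts you supply yourself:

```python
def recommended_threads(p_cores: int, e_cores: int) -> int:
    """Suggested thread count on a hybrid client CPU: P cores * 2 + E cores."""
    return p_cores * 2 + e_cores

# Example: a CPU with 8 P-cores and 8 E-cores -> 8 * 2 + 8 = 24 threads.
print(recommended_threads(8, 8))
```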
+ ## Compilation Requirements and Usage Compile: @@ -67,7 +73,7 @@ Best Performance: Usage: ```cmake add_subdirectory(bestla) -target_link_libraries("${YOUR_PROJECT}" bestla::bestla) +target_link_libraries("${YOUR_PROJECT}" neural_speed::bestla) ``` # Benchmark diff --git a/bestla/bestla/kernel_avx512f.h b/bestla/bestla/kernel_avx512f.h index 268c5fd26..32dc849ea 100644 --- a/bestla/bestla/kernel_avx512f.h +++ b/bestla/bestla/kernel_avx512f.h @@ -2594,7 +2594,7 @@ template inline BTLA_CODE decompress_kblock_s4_s8(utils::int4x2* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::int4x2* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, + typedef BTLA_CODE (*decompfunc)(utils::int4x2 * srcptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { @@ -2816,7 +2816,7 @@ static inline BTLA_CODE decompress_kblock_s2_s8(utils::bit2x4* bit2ptr, int8_t* int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit2x4* srcptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, + typedef BTLA_CODE (*decompfunc)(utils::bit2x4 * srcptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { @@ -3074,7 +3074,7 @@ static inline BTLA_CODE decompress_kblock_s3_s8(utils::bit2x4* bit2ptr, utils::b int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit2x4* bit2ptr, utils::bit1x8* bit1ptr, int8_t* zpptr, int8_t* dstptr, + typedef BTLA_CODE (*decompfunc)(utils::bit2x4 * bit2ptr, utils::bit1x8 * bit1ptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; @@ -3296,7 +3296,7 @@ static inline BTLA_CODE decompress_kblock_s1_s8(utils::bit1x8* bit1ptr, int8_t* int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit1x8* bit1ptr, int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, + typedef BTLA_CODE (*decompfunc)(utils::bit1x8 * bit1ptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { @@ -3545,7 +3545,7 @@ static inline BTLA_CODE decompress_kblock_s5_s8(utils::bit4x2* bit4ptr, utils::b int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit4x2* bit4ptr, utils::bit1x8* bit1ptr, int8_t* zpptr, int8_t* dstptr, + typedef BTLA_CODE (*decompfunc)(utils::bit4x2 * bit4ptr, utils::bit1x8 * bit1ptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; @@ -3859,9 +3859,9 @@ static inline BTLA_CODE decompress_kblock_s7_s8(utils::bit4x2* bit4ptr, utils::b int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef 
BTLA_CODE (*decompfunc)(utils::bit4x2* bit4ptr, utils::bit2x4* bit2ptr, utils::bit1x8* bit1ptr, - int8_t* zpptr, int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, - int row, int8_t* tmp, size_t tmpsize); + typedef BTLA_CODE (*decompfunc)(utils::bit4x2 * bit4ptr, utils::bit2x4 * bit2ptr, utils::bit1x8 * bit1ptr, + int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, + int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; if (col == NTILE) { if constexpr (PackRow == 1) { @@ -4130,7 +4130,7 @@ static inline BTLA_CODE decompress_kblock_s6_s8(utils::bit4x2* bit4ptr, utils::b int8_t* dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int col, int8_t* tmp, size_t tmpsize) { if (zpptr) { - typedef BTLA_CODE (*decompfunc)(utils::bit4x2* bit4ptr, utils::bit2x4* bit2ptr, int8_t* zpptr, int8_t* dstptr, + typedef BTLA_CODE (*decompfunc)(utils::bit4x2 * bit4ptr, utils::bit2x4 * bit2ptr, int8_t * zpptr, int8_t * dstptr, int blocksize, int ldzp, int n_offset, int k_offset, int row, int8_t* tmp, size_t tmpsize); decompfunc func = nullptr; diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md index e31e32468..f1cf65fc0 100644 --- a/docs/advanced_usage.md +++ b/docs/advanced_usage.md @@ -6,7 +6,7 @@ Argument description of run.py ([supported MatMul combinations](#supported-matri | Argument | Description | | -------------- | --------------------------------------------------------------------- | | model | Directory containing model file or model id: String | -| --weight_dtype | Data type of quantized weight: int4/int3/int2/int5/int6/int7/int8/fp8(=fp8_e4m3)/fp8_e5m2/fp4(=fp4e2m1)/nf4 (default int4) | +| --weight_dtype | Data type of quantized weight: int4/int3/int2/int5/int6/int7/int1/int8/fp8(=fp8_e4m3)/fp8_e5m2/fp4(=fp4e2m1)/nf4 (default int4) | | --alg | Quantization algorithm: sym/asym (default sym) | | --group_size | Group size: Int, 16/32/64/128/-1 (per channel) (default: 32) | | --scale_dtype | Data type of scales: fp32/bf16/fp8 (default fp32) | @@ -60,7 +60,7 @@ Argument description of quantize.py ([supported MatMul combinations](#supported- | --build_dir | Path to the build file: String | | --config | Path to the configuration file: String (default: "") | | --nthread | Number of threads to use: Int (default: 1) | -| --weight_dtype | Data type of quantized weight: int4/int3/int2/int5/int6/int7/int8/fp8(=fp8_e4m3)/fp8_e5m2/fp4(=fp4_e2m1)/nf4 (default: int4) | +| --weight_dtype | Data type of quantized weight: int4/int3/int2/int5/int6/int7/int1/int8/fp8(=fp8_e4m3)/fp8_e5m2/fp4(=fp4_e2m1)/nf4 (default: int4) | | --alg | Quantization algorithm to use: sym/asym (default: sym) | | --group_size | Group size: Int 16/32/64/128/-1 (per channel) (default: 32) | | --scale_dtype | Data type of scales: bf16/fp32/fp8 (default: fp32) | @@ -69,7 +69,7 @@ Argument description of quantize.py ([supported MatMul combinations](#supported- #### Supported Matrix Multiplication Data Types Combinations -Our Neural Speed supports INT4 / INT3 / INT2 / INT5 / INT6 / INT7 / INT8 / FP8 (E4M3, E5M2) / FP4 (E2M1) / NF4 weight-only quantization and FP32 / FP16 / BF16 / INT8 computation forward matmul on the Intel platforms. Here are the all supported data types combinations for matmul operations (quantization and forward). +Our Neural Speed supports INT4 / INT3 / INT2 / INT5 / INT6 / INT7 / INT1 / INT8 / FP8 (E4M3, E5M2) / FP4 (E2M1) / NF4 weight-only quantization and FP32 / FP16 / BF16 / INT8 computation forward matmul on the Intel platforms. 
Here are all the supported data type combinations for matmul operations (quantization and forward). > This table will be updated frequently due to active development. For details you can refer to [BesTLA](../bestla#weight-only) | Weight dtype | Compute dtype (default value) | Scale dtype (default value) | Quantization scheme (default value) | @@ -82,6 +82,7 @@ Our Neural Speed supports INT4 / INT3 / INT2 / INT5 / INT6 / INT7 / INT8 / FP8 | INT5 | INT8 / BF16 / FP16 / FP32 (FP32) | BF16 / FP32 (FP32) | sym / asym (sym) | | INT6 | INT8 / BF16 / FP16 / FP32 (FP32) | BF16 / FP32 (FP32) | sym / asym (sym) | | INT7 | INT8 / BF16 / FP16 / FP32 (FP32) | BF16 / FP32 (FP32) | sym / asym (sym) | +| INT1 | INT8 / BF16 / FP16 / FP32 (FP32) | BF16 / FP32 (FP32) | sym / asym (sym) | | FP8 (E4M3, E5M2) | BF16 / FP16 / FP32 (FP32) | FP8 (FP8) | sym (sym) | | FP4 (E2M1) | BF16 / FP16 / FP32 (FP32) | BF16 / FP32 (FP32) | sym (sym) | | NF4 | BF16 / FP16 / FP32 (FP32) | BF16 / FP32 (FP32) | sym (sym) | diff --git a/neural_speed/core/README.md b/neural_speed/core/README.md index 091fbdddd..ee4673e59 100644 --- a/neural_speed/core/README.md +++ b/neural_speed/core/README.md @@ -27,6 +27,7 @@ int2 | symmetric or asymmetric | multiplier of 8, -11 int5 | symmetric or asymmetric | multiplier of 8, -11 int6 | symmetric or asymmetric | multiplier of 8, -11 int7 | symmetric or asymmetric2 | multiplier of 8, -11 +int1 | symmetric or asymmetric | multiplier of 8, -11 int8 | symmetric | multiplier of 8, -11 fp4 | | multiplier of 8 nf4 | | multiplier of 8 @@ -84,7 +85,10 @@ Skylake | sym int3<br>group size=128<br>compute type=fp32 | AVX512F
Alder Lake (12th Gen)<br>Raptor Lake (13th and 14th Gen)| sym int3<br>group size=128<br>compute type=int8 | AVX_VNNI
Older architecture (before 12th Gen)| sym int3<br>group size=128<br>compute type=int8 | AVX2

+`sym int5 group=-1 comp_dtype=int8` is the fastest configuration for first-token performance with good accuracy.
+`sym int3 group=128 comp_dtype=int8` is the fastest configuration for next-token performance with good accuracy.
+
NOTE:
-1. group_size=-1 requires the INC's finetuned model, or it may have lower accuracy than small group sizes. It has the smallest model size, and the fastest first-token performance.
+1. group_size=-1 has the smallest model size and the fastest first-token performance, but it requires an INC-finetuned model, or it may have lower accuracy than small group sizes.
2. group_size=128 is a balance of accuracy and speed if you want RTN quantization only.
3. group_size=32, scale_dtype=bf16, compute_dtype=int8, alg=sym equals llama.cpp's Q4_0.
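To keep the recommended settings in one place, here is a minimal sketch in Python; `pick_weight_config` is an illustrative helper rather than a Neural Speed API, and the dictionary keys simply mirror the parameter names used in this section:

```python
def pick_weight_config(optimize_for: str) -> dict:
    """Return the weight-only quantization settings recommended in this section.

    Illustrative helper only (not a Neural Speed API); pass the returned values
    to whichever quantization entry point you use.
    """
    if optimize_for == "first_token":
        # Fastest first-token configuration with good accuracy.
        return {"weight_dtype": "int5", "alg": "sym", "group_size": -1, "compute_dtype": "int8"}
    if optimize_for == "next_token":
        # Fastest next-token configuration with good accuracy.
        return {"weight_dtype": "int3", "alg": "sym", "group_size": 128, "compute_dtype": "int8"}
    # Default: NOTE 3, the combination equivalent to llama.cpp's Q4_0.
    return {"weight_dtype": "int4", "alg": "sym", "group_size": 32,
            "scale_dtype": "bf16", "compute_dtype": "int8"}


print(pick_weight_config("first_token"))
```

As NOTE 1 explains, the group_size=-1 choices trade accuracy for speed unless an INC-finetuned model is used.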