From 6d21fa1cadf1e623e302eb04c15e4927febc8cf1 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Thu, 30 May 2024 22:02:11 -0400 Subject: [PATCH 01/19] [Kernel] Marlin_24: Ensure the mma.sp instruction is using the ::ordered_metadata modifier (introduced with PTX 8.5) (#5136) --- csrc/quantization/marlin/sparse/common/mma.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index 45ab67a78a1de..fd3dbda5b9c93 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -32,7 +32,8 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, float* c = reinterpret_cast(&frag_c); if (psel == 0) { asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." + "f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x0;\n" : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) @@ -40,7 +41,8 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), "r"(e[0])); asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." + "f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x0;\n" : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) @@ -49,7 +51,8 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, "r"(e[0])); } else { asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." + "f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x1;\n" : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) @@ -57,7 +60,8 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), "r"(e[0])); asm volatile( - "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." + "f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x1;\n" : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) From 533c2177925ba19934eab0095a50d0a783185e6b Mon Sep 17 00:00:00 2001 From: simon-mo Date: Fri, 31 May 2024 02:13:01 +0000 Subject: [PATCH 02/19] Fix cutlass sm_90a vesrion in CMakeList --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8df3a7a26d884..5f991af61d9bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,7 +200,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The CUTLASS kernels for Hopper require sm90a to be enabled. # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a. # That adds an extra 17MB to compiled binary, so instead we selectively enable it. 
- if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 11) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0) set_source_files_properties( "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu" PROPERTIES From a22dea54d3e80bf069cfeed8002a193ef8b18e1b Mon Sep 17 00:00:00 2001 From: SnowDist Date: Fri, 31 May 2024 10:24:41 +0800 Subject: [PATCH 03/19] [Model] Support MAP-NEO model (#5081) Co-authored-by: Zhuohan Li --- benchmarks/kernels/benchmark_paged_attention.py | 2 +- benchmarks/kernels/benchmark_rope.py | 2 +- csrc/attention/attention_kernels.cu | 6 ++++++ csrc/cpu/attention.cpp | 6 ++++++ tests/kernels/test_attention.py | 2 +- tests/kernels/test_cache.py | 2 +- tests/kernels/test_pos_encoding.py | 2 +- vllm/attention/ops/paged_attn.py | 2 +- 8 files changed, 18 insertions(+), 6 deletions(-) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index fc9621e885dc4..e6f4e9e6b9716 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -170,7 +170,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: parser.add_argument("--num-kv-heads", type=int, default=8) parser.add_argument("--head-size", type=int, - choices=[64, 80, 96, 112, 128, 256], + choices=[64, 80, 96, 112, 128, 192, 256], default=128) parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) parser.add_argument("--use-alibi", action="store_true") diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 9188e811e2982..00e55f6060b52 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -93,7 +93,7 @@ def benchmark_rope_kernels_multi_lora( parser.add_argument("--num-heads", type=int, default=8) parser.add_argument("--head-size", type=int, - choices=[64, 80, 96, 112, 128, 256], + choices=[64, 80, 96, 112, 128, 192, 256], default=128) parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) parser.add_argument("--dtype", diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 45edc3252380c..8f89f89786c3b 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -754,6 +754,9 @@ void paged_attention_v1_launcher( case 128: LAUNCH_PAGED_ATTENTION_V1(128); break; + case 192: + LAUNCH_PAGED_ATTENTION_V1(192); + break; case 256: LAUNCH_PAGED_ATTENTION_V1(256); break; @@ -911,6 +914,9 @@ void paged_attention_v2_launcher( case 128: LAUNCH_PAGED_ATTENTION_V2(128); break; + case 192: + LAUNCH_PAGED_ATTENTION_V2(192); + break; case 256: LAUNCH_PAGED_ATTENTION_V2(256); break; diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index 438e9bdb19f50..ed8cfbd421f0f 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -390,6 +390,9 @@ void paged_attention_v1_impl_launcher( case 128: LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); break; + case 192: + LAUNCH_V1_ATTENTION_KERNEL(T, 192, BLOCK_SIZE); + break; case 256: LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); break; @@ -703,6 +706,9 @@ void paged_attention_v2_impl_launcher( case 128: LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); break; + case 192: + LAUNCH_V2_ATTENTION_KERNEL(T, 192, BLOCK_SIZE); + break; case 256: LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); break; diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index fdf313262ca97..8bc4766fc93c4 100644 --- a/tests/kernels/test_attention.py +++ 
b/tests/kernels/test_attention.py @@ -28,7 +28,7 @@ # FlashAttention forward only supports head dimension at most 128 # https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 -HEAD_SIZES = [64, 80, 96, 112, 128, 256 +HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256 ] if not is_hip() else [64, 80, 96, 112, 128] BLOCK_SIZES = [16, 32] diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 9f0cb60dc16e2..29572cfa57499 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -11,7 +11,7 @@ NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 96, 112, 128, 256] +HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] BLOCK_SIZES = [8, 16, 32] # Arbitrary values for testing diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 076730cdbae0d..fbabc02bf9a9d 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -10,7 +10,7 @@ IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] -HEAD_SIZES = [64, 80, 96, 112, 128, 256] +HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] ROTARY_DIMS = [None, 32] # None means rotary dim == head size NUM_HEADS = [7, 17] # Arbitrary values for testing BATCH_SIZES = [1, 5] # Arbitrary values for testing diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index e119fdcf11113..a214f40d16514 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -31,7 +31,7 @@ class PagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 128, 256] + return [64, 80, 96, 112, 128, 192, 256] @staticmethod def get_kv_cache_shape( From e9d3aa04f6e55e2bb540f0810da97ddd0deebb13 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 31 May 2024 00:00:26 -0500 Subject: [PATCH 04/19] Revert "[Kernel] Marlin_24: Ensure the mma.sp instruction is using the ::ordered_metadata modifier (introduced with PTX 8.5)" (#5149) --- csrc/quantization/marlin/sparse/common/mma.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index fd3dbda5b9c93..45ab67a78a1de 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -32,8 +32,7 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, float* c = reinterpret_cast(&frag_c); if (psel == 0) { asm volatile( - "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." - "f32 " + "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x0;\n" : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) @@ -41,8 +40,7 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), "r"(e[0])); asm volatile( - "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." 
- "f32 " + "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x0;\n" : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) @@ -51,8 +49,7 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, "r"(e[0])); } else { asm volatile( - "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." - "f32 " + "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x1;\n" : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) @@ -60,8 +57,7 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), "r"(e[0])); asm volatile( - "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16." - "f32 " + "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x1;\n" : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) From a377f0bd5e1fa0ca069e3dbf28f4de5af64d0bb1 Mon Sep 17 00:00:00 2001 From: functionxu123 <1229853312@qq.com> Date: Fri, 31 May 2024 13:14:50 +0800 Subject: [PATCH 05/19] [Misc]: optimize eager mode host time (#4196) Co-authored-by: xuhao --- vllm/utils.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 26140e15636a4..2781eceb7ba98 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -17,6 +17,7 @@ Hashable, List, Optional, OrderedDict, Tuple, TypeVar, Union) +import numpy as np import psutil import torch @@ -501,11 +502,6 @@ def str_to_int_tuple(s: str) -> Tuple[int, ...]: f"(e.g., 1, 2, 3). Given input: {s}") from e -def pad_to_max_length(x: List[int], max_len: int, pad: int) -> List[int]: - assert len(x) <= max_len - return x + [pad] * (max_len - len(x)) - - def make_tensor_with_pad( x: List[List[int]], max_len: int, @@ -518,7 +514,10 @@ def make_tensor_with_pad( The padding is applied to the end of each inner list until it reaches `max_len`. 
""" - padded_x = [pad_to_max_length(x_i, max_len, pad) for x_i in x] + padded_x = np.zeros([len(x), max_len], dtype=np.int32) + pad + for ind, blocktb in enumerate(x): + assert len(blocktb) <= max_len + padded_x[ind, :len(blocktb)] = blocktb return torch.tensor(padded_x, dtype=dtype, device=device) From e9899fb7a4d9e032198d26ef84f1dd2cfd9621aa Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Fri, 31 May 2024 14:29:19 -0700 Subject: [PATCH 06/19] [Model] Enable FP8 QKV in MoE and refine kernel tuning script (#5039) --- benchmarks/kernels/benchmark_mixtral_moe.py | 48 ++++-- ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 138 +++++++++++++++++ ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 146 ++++++++++++++++++ ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 108 +++++++------ ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 146 ++++++++++++++++++ ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 84 +++++----- ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 146 ++++++++++++++++++ vllm/model_executor/models/mixtral.py | 9 -- 8 files changed, 711 insertions(+), 114 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json diff --git a/benchmarks/kernels/benchmark_mixtral_moe.py b/benchmarks/kernels/benchmark_mixtral_moe.py index 5280b214144c9..196ec8cfce88e 100644 --- a/benchmarks/kernels/benchmark_mixtral_moe.py +++ b/benchmarks/kernels/benchmark_mixtral_moe.py @@ -11,25 +11,36 @@ from vllm.model_executor.layers.fused_moe import (fused_moe, get_config_file_name) -os.environ['CUDA_VISIBLE_DEVICES'] = '0' - -def main(dtype: str): +def main(model, tp_size, gpu, dtype: str): + os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu) method = fused_moe for bs in [ 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, 2048, 3072, 4096 ]: - run_grid(bs, method=method, dtype=dtype) - - -def run_grid(bs, method, dtype: str): - d_model = 4096 + run_grid(bs, + model=model, + method=method, + gpu=gpu, + tp_size=tp_size, + dtype=dtype) + + +def run_grid(bs, model, method, gpu, tp_size, dtype: str): + if model == '8x7B': + d_model = 4096 + model_intermediate_size = 14336 + num_layers = 32 + elif model == '8x22B': + d_model = 6144 + model_intermediate_size = 16384 + num_layers = 56 + else: + raise ValueError(f'Unsupported Mixtral model {model}') num_total_experts = 8 top_k = 2 - tp_size = 2 - model_intermediate_size = 14336 - num_layers = 32 + # tp_size = 2 num_calls = 100 num_warmup_trials = 1 @@ -211,5 +222,18 @@ def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int, choices=['float8', 'float16'], help='Data type used for fused_moe kernel computations', ) + parser.add_argument('--model', + type=str, + default='8x7B', + choices=['8x7B', '8x22B'], + help='The Mixtral model to benchmark') + parser.add_argument('--tp-size', + type=int, + default=2, + help='Tensor paralleli size') + parser.add_argument('--gpu', + type=int, + default=0, + help="GPU ID for benchmarking") args = parser.parse_args() - sys.exit(main(args.dtype)) + sys.exit(main(args.model, args.tp_size, args.gpu, args.dtype)) diff --git 
a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json new file mode 100644 index 0000000000000..3f3ccdafa88f3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -0,0 +1,138 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json new file mode 100644 index 0000000000000..0c495e7e290c6 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json index 9287808a94d0e..5b78c30f08b68 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -3,61 +3,59 @@ "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1 + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 }, "2": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1 + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 }, "4": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1 + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 }, "8": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, "num_warps": 8, - "num_stages": 5 + "num_stages": 2 }, "16": { - "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 5 - }, - "24": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5 }, - "32": { + "24": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, + "GROUP_SIZE_M": 64, + "num_warps": 4, "num_stages": 4 }, - "48": { + "32": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 3 + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 }, - "64": { + "48": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, @@ -65,37 +63,45 @@ "num_warps": 4, "num_stages": 4 }, - "96": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 64, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 32, - "num_warps": 8, - "num_stages": 2 - }, - "128": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 3 + "num_stages": 2 }, - "256": { + "96": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5 }, - "512": { + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "256": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 2 + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 }, "1024": { "BLOCK_SIZE_M": 128, @@ -109,7 +115,7 @@ "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4 }, @@ -125,7 +131,7 @@ "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4 }, diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json new file mode 100644 index 0000000000000..60a65724d68b9 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json index 2ad07bf79a25c..75f8b0017b9c6 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -2,104 +2,104 @@ "1": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 5 }, "2": { - "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, + "GROUP_SIZE_M": 32, + "num_warps": 8, "num_stages": 4 }, "4": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 2 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 4 + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 }, "16": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 4 }, "24": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 + "num_warps": 4, + "num_stages": 5 }, "32": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, "num_stages": 4 }, "48": { 
"BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 4 + "num_stages": 3 }, "64": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, "num_stages": 4 }, "96": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4 }, "128": { "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 4 + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 }, "256": { "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 4 + "num_stages": 5 }, "512": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4 }, @@ -115,7 +115,7 @@ "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4 }, @@ -139,7 +139,7 @@ "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4 } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json new file mode 100644 index 0000000000000..34b916e574f88 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + 
"num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index d6dd7fa1fe9e2..2f4237339486e 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -278,15 +278,6 @@ def __init__( self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta - if isinstance( - quant_config, - Fp8Config) and not quant_config.is_checkpoint_fp8_serialized: - print_warning_once( - "For Mixtral FP8 quantization, we currently do not quantize " - "the attention layers until their FP8 performance is improved." - ) - quant_config = None - self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, From 657579113f714c2e74bca373ecfb6c2c245b4101 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 31 May 2024 17:20:19 -0700 Subject: [PATCH 07/19] [Doc] Add checkmark for GPTBigCodeForCausalLM LoRA support (#5171) --- docs/source/models/supported_models.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index e4bae80343a2c..82e71e61975c8 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -62,7 +62,7 @@ Alongside each architecture, we include some popular models that use it. * - :code:`GPTBigCodeForCausalLM` - StarCoder, SantaCoder, WizardCoder - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. - - + - ✅︎ * - :code:`GPTJForCausalLM` - GPT-J - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. 
From 1197e02141df1a7442f21ff6922c98ec0bba153e Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 31 May 2024 20:21:38 -0400 Subject: [PATCH 08/19] [Build] Guard against older CUDA versions when building CUTLASS 3.x kernels (#5168) --- csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu | 10 ++++++++-- csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu | 11 ++++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu index 5fd6d8ff20867..531414bc45165 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu @@ -1,3 +1,9 @@ +// clang-format will break include orders +// clang-format off +#include + +#if defined CUDA_VERSION && CUDA_VERSION >= 12000 + #include #include @@ -6,8 +12,6 @@ #include #include -// clang-format will break include orders -// clang-format off #include "cutlass/cutlass.h" #include "cute/tensor.hpp" @@ -241,3 +245,5 @@ void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a, } } } + +#endif diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu index dab73ac6c831e..eb532f2ac7a9b 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu @@ -1,5 +1,6 @@ +#include + #include -#include #include void cutlass_scaled_mm_dq_sm75(torch::Tensor& c, torch::Tensor const& a, @@ -17,10 +18,12 @@ void cutlass_scaled_mm_dq_sm89(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales); +#if defined CUDA_VERSION && CUDA_VERSION >= 12000 void cutlass_scaled_mm_dq_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales); +#endif void cutlass_scaled_mm_dq(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, @@ -51,7 +54,13 @@ void cutlass_scaled_mm_dq(torch::Tensor& c, torch::Tensor const& a, if (version_num >= 90) { // Hopper + + // Guard against compilation issues for sm90 kernels +#if defined CUDA_VERSION && CUDA_VERSION >= 12000 cutlass_scaled_mm_dq_sm90(c, a, b, a_scales, b_scales); +#else + cutlass_scaled_mm_dq_sm80(c, a, b, a_scales, b_scales); +#endif } else if (version_num == 89) { // Ada Lovelace cutlass_scaled_mm_dq_sm89(c, a, b, a_scales, b_scales); From a360ff80bb34f9dfcd21cf880c2030daa2d6b3a3 Mon Sep 17 00:00:00 2001 From: Daniele Date: Sat, 1 Jun 2024 06:06:45 +0200 Subject: [PATCH 09/19] [CI/Build] CMakeLists: build all extensions' cmake targets at the same time (#5034) --- setup.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index b4baebb0d4801..d99fc050f6d84 100644 --- a/setup.py +++ b/setup.py @@ -187,19 +187,22 @@ def build_extensions(self) -> None: if not os.path.exists(self.build_temp): os.makedirs(self.build_temp) + targets = [] # Build all the extensions for ext in self.extensions: self.configure(ext) + targets.append(remove_prefix(ext.name, "vllm.")) - ext_target_name = remove_prefix(ext.name, "vllm.") - num_jobs, _ = self.compute_num_jobs() + num_jobs, _ = self.compute_num_jobs() - build_args = [ - '--build', '.', '--target', ext_target_name, '-j', - str(num_jobs) - ] + build_args = [ + "--build", + ".", + f"-j={num_jobs}", + *[f"--target={name}" for name in targets], + ] - 
subprocess.check_call(['cmake', *build_args], cwd=self.build_temp) + subprocess.check_call(["cmake", *build_args], cwd=self.build_temp) def _is_cuda() -> bool: From 260d119e864edbf023b1be7fa446a08bbea11f80 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Sat, 1 Jun 2024 02:45:32 -0400 Subject: [PATCH 10/19] [Kernel] Refactor CUTLASS kernels to always take scales that reside on the GPU (#5137) --- ...ue.hpp => broadcast_load_epilogue_c2x.hpp} | 50 ++- .../broadcast_load_epilogue_c3x.hpp | 389 ++++++++++++++++++ .../cutlass_w8a8/scaled_mm_dq_c2x.cu | 14 +- .../cutlass_w8a8/scaled_mm_dq_c3x.cu | 20 +- pyproject.toml | 2 +- tests/kernels/test_cutlass.py | 13 +- .../compressed_tensors_w8a8_statictensor.py | 33 +- 7 files changed, 445 insertions(+), 76 deletions(-) rename csrc/quantization/cutlass_w8a8/{cutlass_visitor_2x_broadcast_epilogue.hpp => broadcast_load_epilogue_c2x.hpp} (86%) create mode 100644 csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp diff --git a/csrc/quantization/cutlass_w8a8/cutlass_visitor_2x_broadcast_epilogue.hpp b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp similarity index 86% rename from csrc/quantization/cutlass_w8a8/cutlass_visitor_2x_broadcast_epilogue.hpp rename to csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp index ddbee15e54ab6..c4c6b18654eed 100644 --- a/csrc/quantization/cutlass_w8a8/cutlass_visitor_2x_broadcast_epilogue.hpp +++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp @@ -33,20 +33,27 @@ // // This file is a modified excerpt of // include/cutlass/epilogue/fusion/visitor_load.hpp from -// https://github.com/NVIDIA/cutlass It's beem modified to support either -// row/column or scalar broadcasting, like is already supported in CUTLASS 3.x. -// Important because this saves us a factor 4x on the number of kernels -// compiled. +// https://github.com/NVIDIA/cutlass v3.5.0 +// It has been modified to support either +// row/column or scalar broadcasting where the tensor being loaded from is +// always passed in via a device pointer. This lets one compiled kernel handle +// all cases of per-tensor or per-channel/per-token quantization. +// +// This interface also allows the scales to be passed in as tensors that +// consistently reside on the device, which avoids an issue with a previous +// implementation where scalars needed to be on the CPU since they +// were passed in via float values. This created a potential performance hazard +// if scales were initially on the device, and caused torch.compile graph +// breaks when moving scales to the CPU. // #pragma once +// Turn off clang-format for the entire file to keep it close to upstream // clang-format off #include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp" #include "cute/tensor.hpp" -// clang-format on - namespace cutlass::epilogue::threadblock { using namespace cute; @@ -59,9 +66,11 @@ template< > struct VisitorRowOrScalarBroadcast { + // This struct has been modified to have a bool indicating that ptr_row is a + // scalar that must be broadcast. 
struct Arguments { Element const* ptr_row = nullptr; - Element null_default = Element(0); + bool row_broadcast = true; StrideMNL dRow = {}; }; @@ -125,25 +134,25 @@ struct VisitorRowOrScalarBroadcast { auto coord_v = filter(tC_cRow); auto dst_v = filter(tC_rRow); - if (params_ptr->ptr_row) { + if (params_ptr->row_broadcast) { // In this case we are loading from a row vector and broadcasting CUTLASS_PRAGMA_UNROLL for (int i = 0; i < size(src_v); ++i) { bool guard = get<1>(coord_v(i)) < n; - cutlass::arch::global_load(dst_v(i), (void const*)&src_v(i), guard); + cutlass::arch::global_load( + dst_v(i), (void const*)&src_v(i), guard); } } else { // In this case we are loading from a scalar and broadcasting VecType filled_vec; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < VecLength; i++) { - reinterpret_cast(&filled_vec)[i] = params_ptr->null_default; + reinterpret_cast(&filled_vec)[i] = *(params_ptr->ptr_row); } CUTLASS_PRAGMA_UNROLL for (int i = 0; i < size(src_v); ++i) { - if(get<1>(coord_v(i)) < n) - { + if (get<1>(coord_v(i)) < n) { dst_v(i) = filled_vec; } } @@ -208,9 +217,11 @@ template< > struct VisitorColOrScalarBroadcast { + // This struct has been modified to have a bool indicating that ptr_col is a + // scalar that must be broadcast. struct Arguments { Element const* ptr_col = nullptr; - Element null_default = Element(0); + bool col_broadcast = true; StrideMNL dCol = {}; }; @@ -230,11 +241,6 @@ struct VisitorColOrScalarBroadcast { struct SharedStorage { }; - // Global load type - static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits::value; - using VecType = uint_bit_t; - static int constexpr VecLength = sizeof(VecType) / sizeof(Element); - CUTLASS_HOST_DEVICE VisitorColOrScalarBroadcast() { } @@ -267,7 +273,7 @@ struct VisitorColOrScalarBroadcast { int m; // This function is modified from VisitorColBroadcast - CUTLASS_DEVICE void + CUTLASS_DEVICE void begin_epilogue() { clear(tC_rCol); @@ -277,7 +283,7 @@ struct VisitorColOrScalarBroadcast { pred(i) = get<0>(tC_cCol(i)) < m; } - if (params_ptr->ptr_col) { + if (params_ptr->col_broadcast) { // In this case we are loading from a column vector and broadcasting copy_if(pred, tC_gCol, tC_rCol); } else { @@ -286,8 +292,8 @@ struct VisitorColOrScalarBroadcast { CUTLASS_PRAGMA_UNROLL for (int i = 0; i < size(dst_v); ++i) { - if(pred(i)){ - dst_v(i) = params_ptr->null_default; + if (pred(i)) { + dst_v(i) = *(params_ptr->ptr_col); } } } diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp new file mode 100644 index 0000000000000..8f38bbf507901 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp @@ -0,0 +1,389 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// +// This file is a modified excerpt of +// include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp +// from https://github.com/NVIDIA/cutlass v3.5.0 +// It has been modified to support either row/column or scalar broadcasting +// where the tensor being loaded from is always passed in via a device pointer. +// This lets one compiled kernel handle all cases of per-tensor or +// per-channel/per-token quantization. +// +// This interface also allows the scales to be passed in as tensors that +// consistently reside on the device, which avoids an issue with a previous +// implementation where scalars needed to be on the CPU since they +// were passed in via float values. This created a potential performance hazard +// if scales were initially on the device, and caused torch.compile graphs +// breaks when moving scales to the CPU. +// +#pragma once + +// Turn off clang-format for the entire file to keep it close to upstream +// clang-format off + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" + +#include "cute/tensor.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +// Row vector broadcast +template< + // Row bcast reuses the mbarriers from the epilogue subtile load pipeline, so this must be at least + // ceil_div(StagesC, epi tiles per CTA tile) + 1 to ensure no data races + int Stages, + class CtaTileShapeMNK, + class Element, + class StrideMNL = Stride<_0,_1,_0>, + int Alignment = 128 / sizeof_bits_v +> +struct Sm90RowOrScalarBroadcast { + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + static_assert( + (cute::is_same_v>) || // row vector broadcast, e.g. per-col alpha/bias + (cute::is_same_v>)); // batched row vector broadcast + + // Accumulator doesn't distribute row elements evenly amongst threads so we must buffer in smem + struct SharedStorage { + alignas(16) array_aligned(CtaTileShapeMNK{}) * Stages> smem_row; + }; + + // This struct has been modified to have a bool indicating that ptr_row is a + // scalar that must be broadcast, instead of containing a scalar that is + // valid if ptr_row is null. 
+ struct Arguments { + Element const* ptr_row = nullptr; + bool row_broadcast = true; + StrideMNL dRow = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90RowOrScalarBroadcast() { } + + CUTLASS_HOST_DEVICE + Sm90RowOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage) + : params(params), + smem_row(const_cast(shared_storage.smem_row.data())) { } + + Params params; + Element* smem_row; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return true; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (!params.row_broadcast && *(params.ptr_row) == Element(0)); + } + + template + struct ProducerLoadCallbacks : EmptyProducerLoadCallbacks { + CUTLASS_DEVICE + ProducerLoadCallbacks(GTensor&& gRow, STensor&& sRow, Params const& params) + : gRow(cute::forward(gRow)), + sRow(cute::forward(sRow)), + params(params) {} + + GTensor gRow; // (CTA_M,CTA_N) + STensor sRow; // (CTA_M,CTA_N,PIPE) + Params const& params; + + CUTLASS_DEVICE void + begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) { + if (params.ptr_row == nullptr) { + return; + } + + if (issue_tma_load) { + // Increment the expect-tx count of the first subtile's mbarrier by the row vector's byte-size + constexpr uint32_t copy_bytes = size<1>(CtaTileShapeMNK{}) * sizeof_bits_v / 8; + cutlass::arch::ClusterTransactionBarrier::expect_transaction(full_mbarrier_ptr, copy_bytes); + // Issue the TMA bulk copy + auto bulk_copy = Copy_Atom{}.with(*full_mbarrier_ptr); + // Filter so we don't issue redundant copies over stride-0 modes + int bcast_pipe_index = (load_iteration / EpiTiles) % Stages; + copy(bulk_copy, filter(gRow), filter(sRow(_,_,bcast_pipe_index))); + } + } + }; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_shape(M,N,L), params.dRow); + Tensor gRow = local_tile(mRow, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N) + Tensor sRow = make_tensor(make_smem_ptr(smem_row), // (CTA_M,CTA_N,PIPE) + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages), + make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{}))); + + constexpr int EpiTiles = decltype(size<1>(zipped_divide(make_layout(take<0,2>(args.tile_shape_mnk)), args.epi_tile)))::value; + return ProducerLoadCallbacks( + cute::move(gRow), cute::move(sRow), params); + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(RTensor&& tCrRow, STensor&& tCsRow, Params const& params) + : tCrRow(cute::forward(tCrRow)), + tCsRow(cute::forward(tCsRow)), + params(params) {} + + RTensor tCrRow; // (CPY,CPY_M,CPY_N) + STensor tCsRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,PIPE) + Params const& params; + + CUTLASS_DEVICE void + previsit(int 
epi_m, int epi_n, int load_iteration, bool is_producer_load_needed) { + if (!params.row_broadcast) { + fill(tCrRow, *(params.ptr_row)); + return; + } + + if (epi_m == 0) { // Assumes M-major subtile loop + // Filter so we don't issue redundant copies over stride-0 modes + // (only works if 0-strides are in same location, which is by construction) + int bcast_pipe_index = (load_iteration / EpiTiles) % Stages; + copy_aligned(filter(tCsRow(_,_,_,epi_m,epi_n,bcast_pipe_index)), filter(tCrRow)); + } + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_row; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_row[i] = tCrRow(epi_v * FragmentSize + i); + } + + return frg_row; + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + Tensor sRow = make_tensor(make_smem_ptr(smem_row), // (CTA_M,CTA_N,PIPE) + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages), + make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{}))); + Tensor tCsRow = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,PIPE) + sRow, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrRow = make_tensor_like(take<0,3>(tCsRow)); // (CPY,CPY_M,CPY_N) + + constexpr int EpiTiles = decltype(size<1>(zipped_divide(make_layout(take<0,2>(args.tile_shape_mnk)), args.epi_tile)))::value; + return ConsumerStoreCallbacks( + cute::move(tCrRow), cute::move(tCsRow), params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Column vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class Element, + class StrideMNL = Stride<_1,_0,_0>, + int Alignment = 128 / sizeof_bits_v +> +struct Sm90ColOrScalarBroadcast { + static_assert(Stages == 0, "Column broadcast doesn't support smem usage yet"); + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + static_assert( + (cute::is_same_v>) || // col vector broadcast, e.g. per-row alpha/bias + (cute::is_same_v>)); // batched col vector broadcast, e.g. batched per-row bias + + // Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem + struct SharedStorage { }; + + // This struct has been modified to have a bool indicating that ptr_col is a + // scalar that must be broadcast, instead of containing a scalar that is + // valid if ptr_col is null. 
+ struct Arguments { + Element const* ptr_col = nullptr; + bool col_broadcast = true; + StrideMNL dCol = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (!params.col_broadcast && *(params.ptr_col) == Element(0)); + } + + CUTLASS_HOST_DEVICE + Sm90ColOrScalarBroadcast() { } + + CUTLASS_HOST_DEVICE + Sm90ColOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(GTensor&& tCgCol, RTensor&& tCrCol, Params const& params) + : tCgCol(cute::forward(tCgCol)), + tCrCol(cute::forward(tCrCol)), + params(params) {} + + GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + RTensor tCrCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + Params const& params; + + CUTLASS_DEVICE void + begin() { + if (!params.col_broadcast) { + fill(tCrCol, *(params.ptr_col)); + return; + } + + // Filter so we don't issue redundant copies over stride-0 modes + // (only works if 0-strides are in same location, which is by construction) + copy_aligned(filter(tCgCol), filter(tCrCol)); + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_col; + Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i); + } + + return frg_col; + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col), make_shape(M,N,L), params.dCol); + Tensor tCgCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + return ConsumerStoreCallbacks( + cute::move(tCgCol), cute::move(tCrCol), params); + } +}; + +} diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu index 3a6b8a226e18c..65870df0e8fcd 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu @@ -22,7 +22,7 @@ #include "cutlass/epilogue/threadblock/fusion/visitors.hpp" #include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h" -#include "cutlass_visitor_2x_broadcast_epilogue.hpp" +#include "broadcast_load_epilogue_c2x.hpp" #include "common.hpp" // clang-format on @@ -145,17 +145,11 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a, auto a_scales_ptr = a_scales.data_ptr(); auto b_scales_ptr = b_scales.data_ptr(); - // If A and B are quantized per-tensor, then these scale tensors are scalars, - // and they are passed in via the second argument. using ScaleAArgs = typename Gemm::ScaleA::Arguments; - ScaleAArgs a_args = a_scales.numel() == 1 - ? ScaleAArgs{nullptr, a_scales.item(), {}} - : ScaleAArgs{a_scales.data_ptr(), {}, {}}; - using ScaleBArgs = typename Gemm::ScaleB::Arguments; - ScaleBArgs b_args = b_scales.numel() == 1 - ? ScaleBArgs{nullptr, b_scales.item(), {}} - : ScaleBArgs{b_scales.data_ptr(), {}, {}}; + + ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; + ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; typename Gemm::EVTCompute0::Arguments evt0_compute_args{b_args}; diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu index 531414bc45165..2383760abcdb0 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu @@ -18,11 +18,14 @@ #include "cute/atom/mma_atom.hpp" #include "cutlass/numeric_types.h" +#include "cutlass/util/device_memory.h" + #include "cutlass/gemm/device/gemm_universal_adapter.h" #include "cutlass/gemm/kernel/gemm_universal.hpp" #include "cutlass/epilogue/collective/collective_builder.hpp" #include "cutlass/gemm/collective/collective_builder.hpp" +#include "broadcast_load_epilogue_c3x.hpp" #include "common.hpp" // clang-format on @@ -65,7 +68,7 @@ struct cutlass_3x_gemm { using Accum = cutlass::epilogue::fusion::Sm90AccFetch; - using ScaleA = cutlass::epilogue::fusion::Sm90ColBroadcast< + using ScaleA = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast< 0 /*Stages*/, typename EpilogueDescriptor::TileShape, float, Stride, Int<0>, Int<0>>>; @@ -73,7 +76,7 @@ struct cutlass_3x_gemm { cutlass::epilogue::collective::detail::RowBroadcastDescriptor< EpilogueDescriptor, float>; - using ScaleB = cutlass::epilogue::fusion::Sm90RowBroadcast< + using ScaleB = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast< ScaleBDescriptor::Stages, typename EpilogueDescriptor::TileShape, typename ScaleBDescriptor::Element, Stride, Int<1>, Int<0>>>; @@ -166,13 +169,9 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& 
out, torch::Tensor const& a, using ScaleA_Args = typename Gemm::ScaleA::Arguments; using ScaleB_Args = typename Gemm::ScaleB::Arguments; - ScaleA_Args a_args = a_scales.numel() == 1 - ? ScaleA_Args{nullptr, a_scales.item(), {}} - : ScaleA_Args{a_scales.data_ptr(), {}, {}}; - ScaleB_Args b_args = b_scales.numel() == 1 - ? ScaleB_Args{nullptr, b_scales.item(), {}} - : ScaleB_Args{b_scales.data_ptr(), {}, {}}; + ScaleA_Args a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; + ScaleB_Args b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; args.epilogue.thread = {a_args, {b_args}}; @@ -182,10 +181,11 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a, CUTLASS_CHECK(gemm_op.can_implement(args)); size_t workspace_size = gemm_op.get_workspace_size(args); - TORCH_CHECK(workspace_size == 0); + cutlass::device_memory::allocation workspace(workspace_size); auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); - cutlass::Status status = gemm_op.run(args, stream); + + cutlass::Status status = gemm_op.run(args, workspace.get(), stream); CUTLASS_CHECK(status); } } // namespace diff --git a/pyproject.toml b/pyproject.toml index 0e9096fb4c035..06f150009aa81 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ exclude = [ ] [tool.codespell] -ignore-words-list = "dout, te, indicies" +ignore-words-list = "dout, te, indicies, subtile" skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build" [tool.isort] diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 2cf0e86e5ca44..5a18dd5c1e3b3 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -207,14 +207,21 @@ def forward(self, a): self.out_dtype) -def test_cutlass_cuda_graph(): +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool): m, n, k = 512, 512, 512 a = to_int8(torch.randn((m, k), device="cuda")) b = to_int8(torch.randn((n, k), device="cuda").t()) - scale_a = (torch.randn((m, 1), device="cuda", dtype=torch.float32) / 10) - scale_b = (torch.randn((1, n), device="cuda", dtype=torch.float32) / 10) + m_a_scales = m if per_act_token else 1 + n_b_scales = n if per_out_ch else 1 + + scale_a = (torch.randn( + (m_a_scales, 1), device="cuda", dtype=torch.float32) / 10) + scale_b = (torch.randn( + (1, n_b_scales), device="cuda", dtype=torch.float32) / 10) # Construct a trivial model with a single layer that calls a CUTLASS kernel model = CutlassLayer(b, scale_a, scale_b, torch.bfloat16) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py index 64a88b01cd260..7e3e932cfe14a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py @@ -41,46 +41,19 @@ def create_weights(self, layer: torch.nn.Module, # TODO: remove zero_point parameters once the configs given remove them - # Note on input/weight scales and zero_points - # - # When the scales have a single value, it is required that they be - # on the CPU for 2 reasons, - # 1. 
Performance: - # When the scales (input_scale/weight_scales) have only a single - # value, we perform a scalar broadcast of that value during the - # quant/dequant operations. The "quant" and the "gemm+dequant" - # kernels accept the Scalar by-value. These tensors are allocated - # on the CPU in order to avoid the GPU-to-CPU copy when passing - # by-value. - # - # 2. CUDA Graphs: - # CUDA Graphs don't support GPU-to-CPU copy operations during - # stream capture. - # - # TODO: zero-points are not supported yet. But we expect a similar - # pattern. - is_tensor_partitioned = len(output_partition_sizes) != 1 weight_scale_dim = sum( output_partition_sizes) if is_tensor_partitioned else 1 - weight_scale_device = "cpu" if weight_scale_dim == 1 else "cuda" - input_scale = Parameter(torch.empty(1, - device="cpu", - dtype=torch.float32), + input_scale = Parameter(torch.empty(1, dtype=torch.float32), requires_grad=False) - input_zero_point = Parameter(torch.empty(1, - device="cpu", - dtype=torch.int8), + input_zero_point = Parameter(torch.empty(1, dtype=torch.int8), requires_grad=False) weight_scale = Parameter(torch.empty(weight_scale_dim, - device=weight_scale_device, dtype=torch.float32), requires_grad=False) - weight_zero_point = Parameter(torch.empty(1, - device="cpu", - dtype=torch.int8), + weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8), requires_grad=False) weight = Parameter(torch.empty(sum(output_partition_sizes), From f081c3ce4b020fb094e33575d178345c477ab0c6 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sat, 1 Jun 2024 14:16:07 +0530 Subject: [PATCH 11/19] [Kernel] Update Cutlass fp8 configs (#5144) Co-authored-by: Varun Sundar Rabindranath Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> --- .../cutlass_benchmarks/w8a8_benchmarks.py | 352 ++++++++++++++++++ .../cutlass_benchmarks/weight_shapes.py | 37 ++ .../cutlass_w8a8/scaled_mm_dq_c3x.cu | 104 +++++- tests/kernels/test_cutlass.py | 2 +- 4 files changed, 480 insertions(+), 15 deletions(-) create mode 100644 benchmarks/cutlass_benchmarks/w8a8_benchmarks.py create mode 100644 benchmarks/cutlass_benchmarks/weight_shapes.py diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py new file mode 100644 index 0000000000000..6de56f618700d --- /dev/null +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -0,0 +1,352 @@ +import argparse +import copy +import itertools +import pickle as pkl +import time +from typing import Callable, Iterable, List, Tuple + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:] +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + +# helpers + + +def to_fp8(tensor: torch.tensor) -> torch.tensor: + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp( + min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + + +def to_int8(tensor: torch.tensor) -> torch.tensor: + return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) + + +def make_rand_tensors(dtype: torch.dtype, m: int, n: int, + k: int) -> Tuple[torch.tensor, torch.tensor]: + + a = torch.randn((m, k), device='cuda') * 5 + b = torch.randn((n, k), device='cuda').t() * 5 + + if dtype == torch.int8: + return to_int8(a), to_int8(b) + if dtype == torch.float8_e4m3fn: + 
return to_fp8(a), to_fp8(b) + + raise ValueError("unsupported dtype") + + +# impl + + +def pytorch_i8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, + scale_b: torch.tensor, + out_dtype: torch.dtype) -> torch.tensor: + return torch.mm(a, b) + + +def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, + scale_b: torch.tensor, + out_dtype: torch.dtype) -> torch.tensor: + return torch._scaled_mm(a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=out_dtype) + + +def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor, + scale_a: torch.tensor, scale_b: torch.tensor, + out_dtype: torch.dtype) -> torch.tensor: + return torch._scaled_mm(a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=out_dtype, + use_fast_accum=True) + + +def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, + scale_b: torch.tensor, + out_dtype: torch.dtype) -> torch.tensor: + return ops.cutlass_scaled_mm_dq(a, + b, + scale_a, + scale_b, + out_dtype=out_dtype) + + +# bench +def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, + scale_b: torch.tensor, out_dtype: torch.dtype, label: str, + sub_label: str, fn: Callable, description: str) -> TMeasurement: + + min_run_time = 1 + + globals = { + "a": a, + "b": b, + "scale_a": scale_a, + "scale_b": scale_b, + "out_dtype": out_dtype, + "fn": fn, + } + return TBenchmark.Timer( + stmt="fn(a, b, scale_a, scale_b, out_dtype)", + globals=globals, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + + +def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + assert dtype == torch.int8 + a, b = make_rand_tensors(torch.int8, m, n, k) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + + timers = [] + # pytorch impl + timers.append( + bench_fn(a.to(dtype=torch.bfloat16, device="cuda"), + b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b, + torch.bfloat16, label, sub_label, pytorch_i8_impl, + "pytorch_bf16_bf16_bf16_matmul-no-scales")) + + # cutlass impl + timers.append( + bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), + torch.bfloat16, label, sub_label, cutlass_impl, + "cutlass_i8_i8_bf16_scaled_mm")) + + return timers + + +def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + assert dtype == torch.float8_e4m3fn + a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + + timers = [] + + # pytorch impl: bf16 output, without fp8 fast accum + timers.append( + bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, + pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm")) + + # pytorch impl: bf16 output, with fp8 fast accum + timers.append( + bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, + pytorch_fp8_impl_fast_accum, + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum")) + + # pytorch impl: fp16 output, without fp8 fast accum + timers.append( + bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label, + pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm")) + + # pytorch impl: fp16 output, with fp8 fast accum + timers.append( + bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label, + pytorch_fp8_impl_fast_accum, + 
"pytorch_fp8_fp8_fp16_scaled_mm_fast_accum")) + + # cutlass impl: bf16 output + timers.append( + bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), + torch.bfloat16, label, sub_label, cutlass_impl, + "cutlass_fp8_fp8_bf16_scaled_mm")) + # cutlass impl: fp16 output + timers.append( + bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), + torch.float16, label, sub_label, cutlass_impl, + "cutlass_fp8_fp8_fp16_scaled_mm")) + return timers + + +def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + if dtype == torch.int8: + return bench_int8(dtype, m, k, n, label, sub_label) + if dtype == torch.float8_e4m3fn: + return bench_fp8(dtype, m, k, n, label, sub_label) + raise ValueError("unsupported type") + + +# runner +def print_timers(timers: Iterable[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def run(dtype: torch.dtype, + MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: + + results = [] + for m, k, n in MKNs: + timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", + f"MKN=({m}x{k}x{n})") + print_timers(timers) + results.extend(timers) + + return results + + +# output makers +def make_output(data: Iterable[TMeasurement], + MKNs: Iterable[Tuple[int, int, int]], + base_description: str, + timestamp=None): + + print(f"== All Results {base_description} ====") + print_timers(data) + + # pickle all the results + timestamp = int(time.time()) if timestamp is None else timestamp + with open(f"{base_description}-{timestamp}.pkl", "wb") as f: + pkl.dump(data, f) + + +# argparse runners + + +def run_square_bench(args): + dim_sizes = list( + range(args.dim_start, args.dim_end + 1, args.dim_increment)) + MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) + data = run(args.dtype, MKNs) + + make_output(data, MKNs, f"square_bench-{args.dtype}") + + +def run_range_bench(args): + dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment)) + n = len(dim_sizes) + Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes + Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes + Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes + MKNs = list(zip(Ms, Ks, Ns)) + data = run(args.dtype, MKNs) + + make_output(data, MKNs, f"range_bench-{args.dtype}") + + +def run_model_bench(args): + + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + KNs = [] + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KNs.append(KN) + return KNs + + model_bench_data = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + Ms = args.batch_sizes + KNs = model_shapes(model, tp_size) + MKNs = [] + for m in Ms: + for k, n in KNs: + MKNs.append((m, k, n)) + + data = run(args.dtype, MKNs) + model_bench_data.append(data) + + # Print all results + for data, model_tp in zip(model_bench_data, models_tps): + model, tp_size = model_tp + print(f"== Results {args.dtype} {model}-TP{tp_size} ====") + print_timers(data) + + timestamp = int(time.time()) + + all_data = [] + for d in model_bench_data: + all_data.extend(d) + # pickle all data + with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: + pkl.dump(all_data, f) + + +if __name__ == '__main__': + + def to_torch_dtype(dt): + if dt == "int8": + return 
torch.int8 + if dt == "fp8": + return torch.float8_e4m3fn + raise ValueError("unsupported dtype") + + parser = argparse.ArgumentParser( + description=""" +Benchmark Cutlass GEMM. + + To run square GEMMs: + python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 + + To run constant N and K and sweep M: + python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 + + To run dimensions from a model: + python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 + + Output: + - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. + """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument("--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['int8', 'fp8']") + subparsers = parser.add_subparsers(dest="cmd") + + square_parser = subparsers.add_parser("square_bench") + square_parser.add_argument("--dim-start", type=int, required=True) + square_parser.add_argument("--dim-end", type=int, required=True) + square_parser.add_argument("--dim-increment", type=int, required=True) + square_parser.set_defaults(func=run_square_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--dim-start", type=int, required=True) + range_parser.add_argument("--dim-end", type=int, required=True) + range_parser.add_argument("--dim-increment", type=int, required=True) + range_parser.add_argument("--m-constant", type=int, default=None) + range_parser.add_argument("--n-constant", type=int, default=None) + range_parser.add_argument("--k-constant", type=int, default=None) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument("--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys()) + model_parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + model_parser.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py new file mode 100644 index 0000000000000..7ad4a53d376b6 --- /dev/null +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -0,0 +1,37 @@ +# Weight Shapes are in the format +# ([K, N], TP_SPLIT_DIM) +# Example: +# A shape of ([14336, 4096], 0) indicates the following GEMM shape, +# - TP1 : K = 14336, N = 4096 +# - TP2 : K = 7168, N = 4096 +# A shape of ([4096, 6144], 1) indicates the following GEMM shape, +# - TP1 : K = 4096, N = 6144 +# - TP4 : K = 4096, N = 1536 + +# TP1 shapes +WEIGHT_SHAPES = { + "mistralai/Mistral-7B-v0.1": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-2-7b-hf": [ + ([4096, 12288], 1), + ([4096, 4096], 0), + ([4096, 22016], 1), + ([11008, 4096], 0), + ], + "meta-llama/Llama-2-13b-hf": [ + ([5120, 15360], 1), + ([5120, 5120], 0), + ([5120, 27648], 1), + ([13824, 5120], 0), + ], + "meta-llama/Llama-2-70b-hf": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], 
+} diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu index 2383760abcdb0..4c1aec03a3caa 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu @@ -51,6 +51,11 @@ using namespace cute; namespace { +uint32_t next_pow_2(uint32_t const num) { + if (num <= 1) return num; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + template @@ -188,8 +193,89 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a, cutlass::Status status = gemm_op.run(args, workspace.get(), stream); CUTLASS_CHECK(status); } + +template +struct sm90_fp8_config { + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_128, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template +struct sm90_fp8_config { + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template +struct sm90_fp8_config { + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _64, _128>; + using ClusterShape = Shape<_1, _8, _1>; + + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + } // namespace +template +void cutlass_scaled_mm_dq_sm90_fp8_dispatch(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(a_scales.dtype() == torch::kFloat32); + TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + + using Cutlass3xGemmDefault = + typename sm90_fp8_config::Cutlass3xGemm; + using Cutlass3xGemmM64 = + typename sm90_fp8_config::Cutlass3xGemm; + using Cutlass3xGemmM128 = + typename sm90_fp8_config::Cutlass3xGemm; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(64), next_pow_2(m)); // next power of 2 + + if (mp2 <= 64) { + // m in [1, 64] + return cutlass_scaled_mm_dq_dispatcher( + out, a, b, a_scales, b_scales); + } else if (mp2 <= 128) { + // m in (64, 128] + return cutlass_scaled_mm_dq_dispatcher( + out, a, b, a_scales, b_scales); + } else { + // m in (128, inf) + return cutlass_scaled_mm_dq_dispatcher( + out, a, b, a_scales, b_scales); + } +} + void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, @@ -223,24 +309,14 @@ void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); - using TileShape = Shape<_128, _128, _128>; - using ClusterShape = Shape<_1, _2, _1>; - using KernelSchedule = - typename cutlass::gemm::KernelCpAsyncWarpSpecializedCooperative; - using EpilogueSchedule = - typename cutlass::epilogue::TmaWarpSpecializedCooperative; - if (out.dtype() == 
torch::kBFloat16) { - return cutlass_scaled_mm_dq_dispatcher< - cutlass_3x_gemm>( + return cutlass_scaled_mm_dq_sm90_fp8_dispatch( out, a, b, a_scales, b_scales); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - - return cutlass_scaled_mm_dq_dispatcher< - cutlass_3x_gemm>( + return cutlass_scaled_mm_dq_sm90_fp8_dispatch( out, a, b, a_scales, b_scales); } } diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 5a18dd5c1e3b3..079d9650c7af5 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -82,7 +82,7 @@ def cutlass_int8_gemm_helper(m: int, assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0) -@pytest.mark.parametrize("m", [512, 222, 33, 1]) +@pytest.mark.parametrize("m", [512, 222, 100, 33, 1]) @pytest.mark.parametrize("n", [2048, 256, 1024]) @pytest.mark.parametrize("k", [128, 496, 1024]) @pytest.mark.parametrize("per_act_token", [True, False]) From c35407282878cb3a42860d584a4d9eb6aed82299 Mon Sep 17 00:00:00 2001 From: Ye Cao <952129620@qq.com> Date: Sun, 2 Jun 2024 01:11:22 +0800 Subject: [PATCH 12/19] [Minor] Fix the path typo in loader.py: save_sharded_states.py -> save_sharded_state.py (#5151) Signed-off-by: Ye Cao --- vllm/model_executor/model_loader/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index b7b5b5e7695f4..e20da0e15fb93 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -386,7 +386,7 @@ class ShardedStateLoader(BaseModelLoader): Model loader that directly loads each worker's model state dict, which enables a fast load path for large tensor-parallel models where each worker only needs to read its own shard rather than the entire checkpoint. See - `examples/save_sharded_states.py` for creating a sharded checkpoint. + `examples/save_sharded_state.py` for creating a sharded checkpoint. 
""" DEFAULT_PATTERN = "model-rank-{rank}-part-{part}.safetensors" From 37464a0f745a0204da7443d2a6ef4b8f65e5af12 Mon Sep 17 00:00:00 2001 From: Nadav Shmayovits <45605409+NadavShmayo@users.noreply.github.com> Date: Sat, 1 Jun 2024 20:18:50 +0300 Subject: [PATCH 13/19] [Bugfix] Fix call to init_logger in openai server (#4765) --- vllm/entrypoints/openai/api_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 97b35262329ee..95417718b51fe 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -36,7 +36,7 @@ openai_serving_completion: OpenAIServingCompletion openai_serving_embedding: OpenAIServingEmbedding -logger = init_logger(__name__) +logger = init_logger('vllm.entrypoints.openai.api_server') _running_tasks: Set[asyncio.Task] = set() From b9c0605a8e7d558f595bd59ba6e6c95578dc0f1e Mon Sep 17 00:00:00 2001 From: chenqianfzh <51831990+chenqianfzh@users.noreply.github.com> Date: Sat, 1 Jun 2024 13:51:10 -0700 Subject: [PATCH 14/19] [Feature][Kernel] Support bitsandbytes quantization and QLoRA (#4776) --- examples/lora_with_quantization_inference.py | 140 ++++++++++ requirements-dev.txt | 3 + tests/quantization/test_bitsandbytes.py | 80 ++++++ vllm/config.py | 9 +- vllm/engine/arg_utils.py | 38 ++- vllm/model_executor/layers/linear.py | 41 ++- .../layers/quantization/__init__.py | 3 + .../layers/quantization/bitsandbytes.py | 175 +++++++++++++ vllm/model_executor/model_loader/loader.py | 247 +++++++++++++++++- .../model_loader/weight_utils.py | 16 +- vllm/model_executor/models/llama.py | 8 + 11 files changed, 752 insertions(+), 8 deletions(-) create mode 100644 examples/lora_with_quantization_inference.py create mode 100644 tests/quantization/test_bitsandbytes.py create mode 100644 vllm/model_executor/layers/quantization/bitsandbytes.py diff --git a/examples/lora_with_quantization_inference.py b/examples/lora_with_quantization_inference.py new file mode 100644 index 0000000000000..3b2347c1115e1 --- /dev/null +++ b/examples/lora_with_quantization_inference.py @@ -0,0 +1,140 @@ +""" +This example shows how to use LoRA with different quantization techniques +for offline inference. + +Requires HuggingFace credentials for access. 
+""" + +import gc +from typing import List, Optional, Tuple + +import torch +from huggingface_hub import snapshot_download + +from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm.lora.request import LoRARequest + + +def create_test_prompts( + lora_path: str +) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: + return [ + # this is an example of using quantization without LoRA + ("My name is", + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128), None), + # the next three examples use quantization with LoRA + ("my name is", + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128), + LoRARequest("lora-test-1", 1, lora_path)), + ("The capital of USA is", + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128), + LoRARequest("lora-test-2", 1, lora_path)), + ("The capital of France is", + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128), + LoRARequest("lora-test-3", 1, lora_path)), + ] + + +def process_requests(engine: LLMEngine, + test_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]]): + """Continuously process a list of prompts and handle the outputs.""" + request_id = 0 + + while test_prompts or engine.has_unfinished_requests(): + if test_prompts: + prompt, sampling_params, lora_request = test_prompts.pop(0) + engine.add_request(str(request_id), + prompt, + sampling_params, + lora_request=lora_request) + request_id += 1 + + request_outputs: List[RequestOutput] = engine.step() + for request_output in request_outputs: + if request_output.finished: + print("----------------------------------------------------") + print(f"Prompt: {request_output.prompt}") + print(f"Output: {request_output.outputs[0].text}") + + +def initialize_engine(model: str, quantization: str, + lora_repo: Optional[str]) -> LLMEngine: + """Initialize the LLMEngine.""" + + if quantization == "bitsandbytes": + # QLoRA (https://arxiv.org/abs/2305.14314) is a quantization technique. + # It quantizes the model when loading, with some config info from the + # LoRA adapter repo. So need to set the parameter of load_format and + # qlora_adapter_name_or_path as below. 
+ engine_args = EngineArgs( + model=model, + quantization=quantization, + qlora_adapter_name_or_path=lora_repo, + load_format="bitsandbytes", + enable_lora=True, + max_lora_rank=64, + # set it only in GPUs of limited memory + enforce_eager=True) + else: + engine_args = EngineArgs( + model=model, + quantization=quantization, + enable_lora=True, + max_loras=4, + # set it only in GPUs of limited memory + enforce_eager=True) + return LLMEngine.from_engine_args(engine_args) + + +def main(): + """Main function that sets up and runs the prompt processing.""" + + test_configs = [{ + "name": "qlora_inference_example", + 'model': "huggyllama/llama-7b", + 'quantization': "bitsandbytes", + 'lora_repo': 'timdettmers/qlora-flan-7b' + }, { + "name": "AWQ_inference_with_lora_example", + 'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ', + 'quantization': "awq", + 'lora_repo': 'jashing/tinyllama-colorist-lora' + }, { + "name": "GPTQ_inference_with_lora_example", + 'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ', + 'quantization': "gptq", + 'lora_repo': 'jashing/tinyllama-colorist-lora' + }] + + for test_config in test_configs: + print( + f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~" + ) + engine = initialize_engine(test_config['model'], + test_config['quantization'], + test_config['lora_repo']) + lora_path = snapshot_download(repo_id=test_config['lora_repo']) + test_prompts = create_test_prompts(lora_path) + process_requests(engine, test_prompts) + + # Clean up the GPU memory for the next test + del engine + gc.collect() + torch.cuda.empty_cache() + + +if __name__ == '__main__': + main() diff --git a/requirements-dev.txt b/requirements-dev.txt index cf2bb9bef22d9..2c6b33ea813a2 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -35,3 +35,6 @@ aiohttp # Multimodal pillow + +# quantization +bitsandbytes==0.42.0 diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py new file mode 100644 index 0000000000000..4e9feb3c48148 --- /dev/null +++ b/tests/quantization/test_bitsandbytes.py @@ -0,0 +1,80 @@ +'''Tests whether bitsandbytes computation is enabled correctly. + +Run `pytest tests/quantization/test_bitsandbytes.py`. 
+'''
+import pytest
+import torch
+
+from vllm import SamplingParams
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+
+
+@pytest.mark.skipif(
+    capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
+    reason='bitsandbytes is not supported on this GPU type.')
+def test_load_bnb_model(vllm_runner) -> None:
+    llm = vllm_runner('huggyllama/llama-7b',
+                      quantization='bitsandbytes',
+                      load_format='bitsandbytes',
+                      enforce_eager=True)
+
+    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
+
+    # check the weights in MLP & SelfAttention are quantized to torch.uint8
+    qweight = model.model.layers[0].mlp.gate_up_proj.qweight
+    assert qweight.dtype == torch.uint8, (
+        f'Expected gate_up_proj dtype torch.uint8 but got {qweight.dtype}')
+
+    qweight = model.model.layers[0].mlp.down_proj.qweight
+    assert qweight.dtype == torch.uint8, (
+        f'Expected down_proj dtype torch.uint8 but got {qweight.dtype}')
+
+    qweight = model.model.layers[0].self_attn.o_proj.qweight
+    assert qweight.dtype == torch.uint8, (
+        f'Expected o_proj dtype torch.uint8 but got {qweight.dtype}')
+
+    qweight = model.model.layers[0].self_attn.qkv_proj.qweight
+    assert qweight.dtype == torch.uint8, (
+        f'Expected qkv_proj dtype torch.uint8 but got {qweight.dtype}')
+
+    # some weights should not be quantized
+    weight = model.lm_head.weight
+    assert weight.dtype != torch.uint8, (
+        'lm_head weight dtype should not be torch.uint8')
+
+    weight = model.model.embed_tokens.weight
+    assert weight.dtype != torch.uint8, (
+        'embed_tokens weight dtype should not be torch.uint8')
+
+    weight = model.model.layers[0].input_layernorm.weight
+    assert weight.dtype != torch.uint8, (
+        'input_layernorm weight dtype should not be torch.uint8')
+
+    weight = model.model.layers[0].post_attention_layernorm.weight
+    assert weight.dtype != torch.uint8, (
+        'post_attention_layernorm weight dtype should not be torch.uint8')
+
+    # check the output of the model is expected
+    sampling_params = SamplingParams(temperature=0.0,
+                                     logprobs=1,
+                                     prompt_logprobs=1,
+                                     max_tokens=8)
+
+    prompts = ['That which does not kill us', 'To be or not to be,']
+    expected_outputs = [
+        'That which does not kill us makes us stronger.',
+        'To be or not to be, that is the question.'
+    ]
+    outputs = llm.generate(prompts, sampling_params=sampling_params)
+
+    assert len(outputs) == len(prompts)
+
+    for index in range(len(outputs)):
+        # compare the first line of the output
+        actual_output = outputs[index][1][0].split('\n', 1)[0]
+        expected_output = expected_outputs[index].split('\n', 1)[0]
+        assert actual_output == expected_output, (
+            f'Expected: {expected_output}, but got: {actual_output}')
diff --git a/vllm/config.py b/vllm/config.py
index 4d05b4ea36d5c..ba4361ffb98b4 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -241,6 +241,12 @@ def verify_with_parallel_config(
                 "must be divisible by pipeline parallel size "
                 f"({pipeline_parallel_size}).")
 
+        if self.quantization == "bitsandbytes" and (
+                parallel_config.tensor_parallel_size > 1
+                or parallel_config.pipeline_parallel_size > 1):
+            raise ValueError(
+                "BitsAndBytes quantization with TP or PP is not supported yet.")
+
     def get_hf_config_sliding_window(self) -> Optional[int]:
         """Get the sliding window size, or None if disabled.
""" @@ -327,7 +333,7 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: def get_num_attention_heads(self, parallel_config: "ParallelConfig") -> int: return self.hf_text_config.num_attention_heads // \ - parallel_config.tensor_parallel_size + parallel_config.tensor_parallel_size def get_num_layers(self, parallel_config: "ParallelConfig") -> int: total_num_hidden_layers = self.hf_text_config.num_hidden_layers @@ -487,6 +493,7 @@ class LoadFormat(str, enum.Enum): DUMMY = "dummy" TENSORIZER = "tensorizer" SHARDED_STATE = "sharded_state" + BITSANDBYTES = "bitsandbytes" @dataclass diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 11485aa2438c0..8a73fc931a95a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -92,6 +92,8 @@ class EngineArgs: ngram_prompt_lookup_max: Optional[int] = None ngram_prompt_lookup_min: Optional[int] = None + qlora_adapter_name_or_path: Optional[str] = None + def __post_init__(self): if self.tokenizer is None: self.tokenizer = self.model @@ -159,7 +161,8 @@ def add_cli_args( type=str, default=EngineArgs.load_format, choices=[ - 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer' + 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', + 'bitsandbytes' ], help='The format of the model weights to load.\n\n' '* "auto" will try to load the weights in the safetensors format ' @@ -173,7 +176,9 @@ def add_cli_args( 'which is mainly for profiling.\n' '* "tensorizer" will load the weights using tensorizer from ' 'CoreWeave. See the Tensorize vLLM Model script in the Examples' - 'section for more information.\n') + 'section for more information.\n' + '* "bitsandbytes" will load the weights using bitsandbytes ' + 'quantization.\n') parser.add_argument( '--dtype', type=str, @@ -543,7 +548,10 @@ def add_cli_args( "will also be used in `model_name` tag content of " "prometheus metrics, if multiple names provided, metrics" "tag will take the first one.") - + parser.add_argument('--qlora-adapter-name-or-path', + type=str, + default=None, + help='Name or path of the QLoRA adapter.') return parser @classmethod @@ -555,6 +563,23 @@ def from_cli_args(cls, args: argparse.Namespace): return engine_args def create_engine_config(self, ) -> EngineConfig: + + # bitsandbytes quantization needs a specific model loader + # so we make sure the quant method and the load format are consistent + if (self.quantization == "bitsandbytes" or + self.qlora_adapter_name_or_path is not None) and \ + self.load_format != "bitsandbytes": + raise ValueError( + "BitsAndBytes quantization and QLoRA adapter only support " + f"'bitsandbytes' load format, but got {self.load_format}") + + if (self.load_format == "bitsandbytes" or + self.qlora_adapter_name_or_path is not None) and \ + self.quantization != "bitsandbytes": + raise ValueError( + "BitsAndBytes load format and QLoRA adapter only support " + f"'bitsandbytes' quantization, but got {self.quantization}") + device_config = DeviceConfig(self.device) model_config = ModelConfig( self.model, self.tokenizer, self.tokenizer_mode, @@ -622,6 +647,13 @@ def create_engine_config(self, ) -> EngineConfig: max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 else None) if self.enable_lora else None + if self.qlora_adapter_name_or_path is not None and \ + self.qlora_adapter_name_or_path != "": + if self.model_loader_extra_config is None: + self.model_loader_extra_config = {} + self.model_loader_extra_config[ + "qlora_adapter_name_or_path"] = 
self.qlora_adapter_name_or_path + load_config = LoadConfig( load_format=self.load_format, download_dir=self.download_dir, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 34fbfa8e33ef9..f5b6bdd9f7fd7 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -1,5 +1,5 @@ from abc import abstractmethod -from typing import List, Optional +from typing import Dict, List, Optional, Tuple import torch import torch.nn.functional as F @@ -26,6 +26,21 @@ def adjust_marlin_shard(param, shard_size, shard_offset): return shard_size * marlin_tile_size, shard_offset * marlin_tile_size +def adjust_bitsandbytes_shard(param: Parameter, + qkv_offsets: Dict[str, Tuple[int, int]], + loaded_shard_id: str) -> Tuple[int, int]: + """Adjust the quantization offsets and sizes for BitsAndBytes sharding.""" + + total, _ = qkv_offsets["total"] + orig_offset, orig_size = qkv_offsets[loaded_shard_id] + + quantized_total = param.data.shape[0] + quantized_offset = orig_offset * quantized_total // total + quantized_size = orig_size * quantized_total // total + + return quantized_size, quantized_offset + + class LinearMethodBase(QuantizeMethodBase): """Base class for different (maybe quantized) linear methods.""" @@ -37,7 +52,7 @@ def create_weights(self, layer: torch.nn.Module, **extra_weight_attrs): """Create weights for a linear layer. The weights will be set as attributes of the layer. - + Args: layer: The layer that is using the LinearMethodBase factory. input_size_per_partition: Size of the weight input dim on rank X. @@ -416,6 +431,12 @@ def weight_loader(self, shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) + use_bitsandbytes = getattr(param, "use_bitsandbytes", False) + if use_bitsandbytes: + shard_size = loaded_weight.shape[output_dim] + shard_offset = loaded_weight.shape[output_dim] * \ + loaded_shard_id + param_data = param_data.narrow(output_dim, shard_offset, shard_size) start_idx = tp_rank * shard_size @@ -615,6 +636,22 @@ def weight_loader(self, shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) + use_bitsandbytes = getattr(param, "use_bitsandbytes", False) + if use_bitsandbytes: + orig_qkv_offsets = { + "q": (0, self.num_heads * self.head_size), + "k": (self.num_heads * self.head_size, + self.num_kv_heads * self.head_size), + "v": + ((self.num_heads + self.num_kv_heads) * self.head_size, + self.num_kv_heads * self.head_size), + "total": + ((self.num_heads + 2 * self.num_kv_heads) * self.head_size, + 0) + } + shard_size, shard_offset = adjust_bitsandbytes_shard( + param, orig_qkv_offsets, loaded_shard_id) + param_data = param_data.narrow(output_dim, shard_offset, shard_size) if loaded_shard_id == "q": diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 7b9abe1b629a1..0bc42beb66257 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -4,6 +4,8 @@ from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.quantization.bitsandbytes import ( + BitsAndBytesConfig) from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensorsConfig) from vllm.model_executor.layers.quantization.deepspeedfp import ( @@ -30,6 +32,7 @@ "gptq": GPTQConfig, 
"squeezellm": SqueezeLLMConfig, "sparseml": CompressedTensorsConfig, + "bitsandbytes": BitsAndBytesConfig, } diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py new file mode 100644 index 0000000000000..969958d9b5448 --- /dev/null +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -0,0 +1,175 @@ +from typing import Any, Dict, List, Optional + +import torch +from torch.nn.parameter import Parameter + +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + set_weight_attrs) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + + +class BitsAndBytesConfig(QuantizationConfig): + """Config class for BitsAndBytes Quantization. + + Reference: https://arxiv.org/abs/2305.14314 + """ + + def __init__( + self, + adapter_name_or_path: str, + target_modules: List[str], + ) -> None: + + self.adapter_name_or_path = adapter_name_or_path + self.target_modules = target_modules + + def __repr__(self) -> str: + return ( + f"BitsAndBytesConfig(adapter_name_or_path={self.adapter_name_or_path}" + ) + + @classmethod + def get_name(self) -> str: + return "bitsandbytes" + + @classmethod + def get_supported_act_dtypes(self) -> List[torch.dtype]: + return [torch.float32, torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(self) -> int: + return 70 + + @staticmethod + def get_config_filenames() -> List[str]: + return [ + "adapter_config.json", + ] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "BitsAndBytesConfig": + adapter_name = cls.get_from_keys(config, ["adapter_name_or_path"]) + default_target_modules = [ + "gate_proj", "down_proj", "up_proj", "q_proj", "k_proj", "v_proj", + "o_proj" + ] + if adapter_name == "": + target_modules = default_target_modules + else: + target_modules = cls.get_from_keys(config, ["target_modules"]) + return cls(adapter_name, target_modules) + + def get_quant_method( + self, + layer: torch.nn.Module) -> Optional["BitsAndBytesLinearMethod"]: + if isinstance(layer, LinearBase): + return BitsAndBytesLinearMethod(self) + return None + + def get_scaled_act_names(self) -> List[str]: + return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"] + + +class BitsAndBytesLinearMethod(LinearMethodBase): + """Linear method for BitsAndBytes. + + Args: + quant_config: The BitsAndBytes quantization config. + """ + + def __init__(self, quant_config: BitsAndBytesConfig): + try: + import bitsandbytes + if bitsandbytes.__version__ < "0.42.0": + raise ImportError("bitsandbytes version is wrong. Please " + "install bitsandbytes>=0.42.0.") + except ImportError as err: + raise ImportError("Please install bitsandbytes>=0.42.0 via " + "`pip install bitsandbytes>=0.42.0` to use " + "bitsandbytes quantizer.") from err + + self.quant_config = quant_config + + def create_weights(self, layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): + quant_ratio = 0 + if params_dtype.is_floating_point: + quant_ratio = torch.finfo(params_dtype).bits // torch.iinfo( + torch.uint8).bits + else: + quant_ratio = torch.iinfo(params_dtype).bits // torch.iinfo( + torch.uint8).bits + + if input_size_per_partition * sum( + output_partition_sizes) % quant_ratio != 0: + raise ValueError( + "The input size is not aligned with the quantized " + "weight shape. 
") + qweight = Parameter( + torch.empty( + input_size_per_partition * sum(output_partition_sizes) // + quant_ratio, + 1, + dtype=torch.uint8, + ), + requires_grad=False, + ) + + set_weight_attrs( + qweight, + { + "input_dim": 0, + # In bitsandbytes, a tensor of shape [n,m] is quantized to + #[n*m/pack_ratio, 1],so the output_dim is 0 + "output_dim": 0, + "pack_factor": quant_ratio, + "use_bitsandbytes": True, + }) + layer.register_parameter("qweight", qweight) + set_weight_attrs(qweight, extra_weight_attrs) + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + + # only load the bitsandbytes module when needed + from bitsandbytes import matmul_4bit + + original_type = x.dtype + bf_x = x.to(torch.bfloat16) + + qweight = layer.qweight + quant_states = qweight.bnb_quant_state + offsets = qweight.bnb_shard_offsets + + out_dim_0 = x.shape[0] + out_dim_1 = sum( + [quant_state[1].shape[0] for quant_state in quant_states.items()]) + out = torch.empty(out_dim_0, + out_dim_1, + dtype=torch.bfloat16, + device=x.device) + + current_index = 0 + for i in range(len(quant_states)): + output_size = quant_states[i].shape[0] + # It is more efficient to use out kwarg like + # matmul_4bit(..., out = ...). Infeasible now due to the bug + # https://github.com/TimDettmers/bitsandbytes/issues/1235. + # Need to change after the bug is fixed. + out[:, current_index:current_index + output_size] = matmul_4bit( + bf_x, qweight[offsets[i]:offsets[i + 1]].t(), quant_states[i]) + + current_index += output_size + + out = out.to(original_type) + + if bias is not None: + out += bias + + return out diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index e20da0e15fb93..9c2eaee2eda55 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -1,13 +1,18 @@ # ruff: noqa: SIM117 import collections import copy +import fnmatch import glob +import json +import math import os from abc import ABC, abstractmethod from typing import Any, Dict, Generator, List, Optional, Tuple, Type import huggingface_hub +import numpy as np import torch +from huggingface_hub import HfApi, hf_hub_download from torch import nn from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoadFormat, @@ -28,6 +33,7 @@ get_quant_config, initialize_dummy_weights, np_cache_weights_iterator, pt_weights_iterator, safetensors_weights_iterator) from vllm.model_executor.models.vlm_base import VisionLanguageModelBase +from vllm.model_executor.utils import set_weight_attrs logger = init_logger(__name__) @@ -125,7 +131,7 @@ def __init__(self, load_config: LoadConfig): def _maybe_download_from_modelscope( self, model: str, revision: Optional[str]) -> Optional[str]: """Download model from ModelScope hub if VLLM_USE_MODELSCOPE is True. 
- + Returns the path to the downloaded model, or None if the model is not downloaded from ModelScope.""" if VLLM_USE_MODELSCOPE: @@ -247,6 +253,7 @@ def load_model(self, *, model_config: ModelConfig, model, "fall_back_to_pt_during_load", True)), ) + for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None) if quant_method is not None: @@ -539,6 +546,241 @@ def save_model( ) +class BitsAndBytesModelLoader(BaseModelLoader): + """Model loader to load model weights with BitAndBytes quantization.""" + + default_target_modules = [ + "gate_proj", "down_proj", "up_proj", "q_proj", "k_proj", "v_proj", + "o_proj" + ] + + possible_config_file_names = ["adapter_config.json"] + + def __init__(self, load_config: LoadConfig): + super().__init__(load_config) + + # we don't need to quantize the whole model, only the target modules + # that are specified in the adapter config file. If the adapter config + # file is not provided, we will quantize the default modules. + if (not load_config.model_loader_extra_config + or "qlora_adapter_name_or_path" + not in load_config.model_loader_extra_config): + self.target_modules = self.default_target_modules + return + + qlora_adapter = load_config.model_loader_extra_config[ + "qlora_adapter_name_or_path"] + + config_file_path = self._get_config_file(qlora_adapter) + + with open(config_file_path, "r") as f: + config = json.load(f) + self.target_modules = config["target_modules"] + + def _get_config_file(self, qlora_adapter: str) -> str: + is_local = os.path.isdir(qlora_adapter) + config_file_path = None + if is_local: + for file in self.possible_config_file_names: + config_file_path = os.path.join(qlora_adapter, file) + if os.path.exists(config_file_path): + break + else: + hf_api = HfApi() + repo_files = hf_api.list_repo_files(repo_id=qlora_adapter) + for file in self.possible_config_file_names: + if file in repo_files: + config_file_path = hf_hub_download(repo_id=qlora_adapter, + filename=file) + break + + if not config_file_path: + raise ValueError( + f"Cannot find adapter config file in {qlora_adapter}") + + return config_file_path + + def _get_weight_files( + self, + model_name_or_path: str, + allowed_patterns: List[str], + revision: Optional[str] = None) -> Tuple[List[str], str]: + """Retrieve weight files. Download the files if necessary. 
+ + Return the weight files and the file pattern.""" + is_local = os.path.isdir(model_name_or_path) + + if is_local: + for pattern in allowed_patterns: + weight_files = glob.glob( + os.path.join(model_name_or_path, pattern)) + if weight_files: + return weight_files, pattern + else: + hf_api = HfApi() + repo_files = hf_api.list_repo_files(repo_id=model_name_or_path) + for pattern in allowed_patterns: + matching_files = fnmatch.filter(repo_files, pattern) + if matching_files: + hf_folder = download_weights_from_hf( + model_name_or_path, self.load_config.download_dir, + [pattern], revision) + return glob.glob(os.path.join(hf_folder, pattern)), pattern + + raise RuntimeError( + f"No model weights found in: `{model_name_or_path}`") + + def _prepare_weights(self, model_name_or_path: str, + revision: Optional[str]) -> Tuple[List[str], bool]: + """Prepare weight files for the model.""" + + allowed_patterns = ["*.safetensors", "*.bin", "*.pt"] + + hf_weights_files, matched_pattern = self._get_weight_files( + model_name_or_path, allowed_patterns, revision) + + if matched_pattern != "*.safetensors": + hf_weights_files = filter_files_not_needed_for_inference( + hf_weights_files) + + if len(hf_weights_files) == 0: + raise RuntimeError( + f"Cannot find any model weights with `{model_name_or_path}`") + + return hf_weights_files, matched_pattern == "*.safetensors" + + def _get_quantized_weights_iterator( + self, model_name_or_path: str, revision: Optional[str] + ) -> Tuple[Generator[Tuple[str, torch.Tensor], None, None], Dict[str, + Any]]: + """Get an iterator to the model weights with bitsandbytes quantization, + as well as the quantization state dictionary.""" + + # only load the bitsandbytes module when needed + try: + import bitsandbytes + if bitsandbytes.__version__ < "0.42.0": + raise ImportError("bitsandbytes version is wrong. Please " + "install bitsandbytes>=0.42.0.") + from bitsandbytes.functional import quantize_4bit + except ImportError as err: + raise ImportError("Please install bitsandbytes>=0.42.0 via " + "`pip install bitsandbytes>=0.42.0` to use " + "bitsandbytes quantizer.") from err + + hf_weights_files, use_safetensors = self._prepare_weights( + model_name_or_path, revision) + + quant_state_dict = {} + if use_safetensors: + weight_iterator = safetensors_weights_iterator(hf_weights_files) + else: + weight_iterator = pt_weights_iterator(hf_weights_files) + + def generator(): + for weight_name, weight_tensor in weight_iterator: + if any(target_module in weight_name + for target_module in self.target_modules): + weight_name = weight_name.replace(".weight", ".qweight") + # bitsandbytes requires data in GPU + loaded_weight = weight_tensor.cuda().data + with set_default_torch_dtype(torch.float32): + processed_weight, quant_state = quantize_4bit( + loaded_weight, + compress_statistics=True, + quant_type="nf4") + + quant_state_dict[weight_name] = quant_state + else: + processed_weight = weight_tensor + + yield weight_name, processed_weight + + return generator(), quant_state_dict + + def _load_weights(self, model_config: ModelConfig, + model: nn.Module) -> None: + if not hasattr(model, 'load_weights'): + raise AttributeError( + "The required method 'load_weights' is not defined in class" + f" {type(self).__name__}.") + + if not hasattr(model, 'bitsandbytes_stacked_params_mapping'): + raise AttributeError( + f"Model {type(self).__name__} does not support BitsAndBytes " + "quantization yet.") + + logger.info("Loading weights with BitsAndBytes quantization. 
" + " May take a while ...") + + qweight_iterator, quant_state_dict = ( + self._get_quantized_weights_iterator(model_config.model, + model_config.revision)) + + model.load_weights(qweight_iterator) + + param_dict = dict(model.named_parameters()) + stacked_quant_state_dict: Dict[str, Dict[int, Any]] = {} + for quant_param_name in quant_state_dict: + non_stacked_param_name = quant_param_name + + shard_index = 0 + for shard_name, ( + weight_name, index + ) in model.bitsandbytes_stacked_params_mapping.items(): + if shard_name in quant_param_name: + shard_index = index + quant_param_name = quant_param_name.replace( + shard_name, weight_name) + break + + if quant_param_name not in param_dict: + raise ValueError( + f"Parameter {quant_param_name} not found in the model.") + + if quant_param_name not in stacked_quant_state_dict: + stacked_quant_state_dict[quant_param_name] = {} + + stacked_quant_state_dict[quant_param_name][shard_index] = ( + quant_state_dict[non_stacked_param_name]) + + # save quant_states and offsets as the attributes of the parameters + for param_name, param in param_dict.items(): + if param_name in stacked_quant_state_dict: + quant_states = stacked_quant_state_dict[param_name] + set_weight_attrs(param, {"bnb_quant_state": quant_states}) + + pack_ratio = getattr(param, "pack_factor", -1) + if pack_ratio == -1: + raise ValueError( + f"pack_factor not set for parameter {param_name}.") + + num_elements = [0] * len(quant_states) + for seq, quant_state in enumerate(quant_states.items()): + num_elements[seq] = math.prod( + quant_state[1].shape) // pack_ratio + + offsets = np.concatenate(([0], np.cumsum(num_elements))) + set_weight_attrs(param, {"bnb_shard_offsets": offsets}) + + def load_model(self, *, model_config: ModelConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + vision_language_config: Optional[VisionLanguageConfig], + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + cache_config: CacheConfig) -> nn.Module: + with set_default_torch_dtype(model_config.dtype): + with torch.device(device_config.device): + model = _initialize_model(model_config, self.load_config, + lora_config, vision_language_config, + cache_config) + + self._load_weights(model_config, model) + + return model.eval() + + def get_model_loader(load_config: LoadConfig) -> BaseModelLoader: """Get a model loader based on the load format.""" @@ -554,4 +796,7 @@ def get_model_loader(load_config: LoadConfig) -> BaseModelLoader: if load_config.load_format == LoadFormat.SHARDED_STATE: return ShardedStateLoader(load_config) + if load_config.load_format == LoadFormat.BITSANDBYTES: + return BitsAndBytesModelLoader(load_config) + return DefaultModelLoader(load_config) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 53e21eba8fae3..6174f0a974712 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -130,7 +130,17 @@ def get_quant_config(model_config: ModelConfig, if hf_quant_config is not None: return quant_cls.from_config(hf_quant_config) - model_name_or_path = model_config.model + # In case of bitsandbytes/QLoRA, get quant config from the adapter model. 
+ if model_config.quantization == "bitsandbytes": + if (not load_config.model_loader_extra_config + or "qlora_adapter_name_or_path" + not in load_config.model_loader_extra_config): + return quant_cls.from_config({"adapter_name_or_path": ""}) + model_name_or_path = load_config.model_loader_extra_config[ + "qlora_adapter_name_or_path"] + + else: + model_name_or_path = model_config.model is_local = os.path.isdir(model_name_or_path) if not is_local: # Download the config files. @@ -169,6 +179,10 @@ def get_quant_config(model_config: ModelConfig, quant_config_file = quant_config_files[0] with open(quant_config_file, "r") as f: config = json.load(f) + + if model_config.quantization == "bitsandbytes": + config["adapter_name_or_path"] = model_name_or_path + return quant_cls.from_config(config) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2ca55f9270fc7..d83ee9a201c0b 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -319,6 +319,14 @@ class LlamaForCausalLM(nn.Module): "lm_head": "output_embeddings", } embedding_padding_modules = ["lm_head"] + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } def __init__( self, From 8279078e218833b357f7c5076850e3688714d570 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Sat, 1 Jun 2024 15:40:25 -0700 Subject: [PATCH 15/19] [Bugfix] Remove deprecated @abstractproperty (#5174) --- vllm/core/evictor_v1.py | 5 +++-- vllm/core/evictor_v2.py | 5 +++-- vllm/lora/worker_manager.py | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/core/evictor_v1.py b/vllm/core/evictor_v1.py index aa51dd6938872..5db5a08a5bb67 100644 --- a/vllm/core/evictor_v1.py +++ b/vllm/core/evictor_v1.py @@ -1,5 +1,5 @@ import enum -from abc import ABC, abstractmethod, abstractproperty +from abc import ABC, abstractmethod from typing import OrderedDict from vllm.block import PhysicalTokenBlock @@ -44,7 +44,8 @@ def remove(self, block_hash: int) -> PhysicalTokenBlock: """ pass - @abstractproperty + @property + @abstractmethod def num_blocks(self) -> int: pass diff --git a/vllm/core/evictor_v2.py b/vllm/core/evictor_v2.py index 57759b29347f4..3dd12e2e25131 100644 --- a/vllm/core/evictor_v2.py +++ b/vllm/core/evictor_v2.py @@ -1,5 +1,5 @@ import enum -from abc import ABC, abstractmethod, abstractproperty +from abc import ABC, abstractmethod from typing import OrderedDict, Tuple @@ -46,7 +46,8 @@ def remove(self, block_id: int): """Remove a given block id from the cache.""" pass - @abstractproperty + @property + @abstractmethod def num_blocks(self) -> int: pass diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index d67ce67172e30..4657757bd484b 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -1,4 +1,4 @@ -from abc import ABC, abstractmethod, abstractproperty +from abc import ABC, abstractmethod from contextlib import contextmanager from typing import Any, Dict, List, Literal, Optional, Set, Type, Union @@ -42,7 +42,8 @@ def dummy_lora_cache(self): yield self._cached_dummy_lora = False - @abstractproperty + @property + @abstractmethod def is_enabled(self) -> bool: ... 
From c2d6d2f960176491e0499656409f30b947ee8027 Mon Sep 17 00:00:00 2001 From: Daniil Arapov <59310708+Delviet@users.noreply.github.com> Date: Sun, 2 Jun 2024 01:53:52 +0300 Subject: [PATCH 16/19] [Bugfix]: Fix issues related to prefix caching example (#5177) (#5180) --- examples/offline_inference_with_prefix.py | 47 ++++++++++++++++++----- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py index 7ed0563f14e0e..166e98549b536 100644 --- a/examples/offline_inference_with_prefix.py +++ b/examples/offline_inference_with_prefix.py @@ -1,5 +1,8 @@ +from time import time + from vllm import LLM, SamplingParams +# Common prefix. prefix = ( "You are an expert school principal, skilled in effectively managing " "faculty and staff. Draft 10-15 questions for a potential first grade " @@ -18,36 +21,60 @@ "The capital of France is", "The future of AI is", ] + +generating_prompts = [prefix + prompt for prompt in prompts] + # Create a sampling params object. sampling_params = SamplingParams(temperature=0.0) # Create an LLM. -llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True) +regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4) -generating_prompts = [prefix + prompt for prompt in prompts] +prefix_cached_llm = LLM(model="facebook/opt-125m", + enable_prefix_caching=True, + gpu_memory_utilization=0.4) +print("Results without `enable_prefix_caching`") # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. -outputs = llm.generate(generating_prompts, sampling_params) +start_time_regular = time() +outputs = regular_llm.generate(generating_prompts, sampling_params) +duration_regular = time() - start_time_regular + +regular_generated_texts = [] # Print the outputs. for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text + regular_generated_texts.append(generated_text) print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print("-" * 80) # The llm.generate call will batch all prompts and send the batch at once -# if resources allow. The prefix will only be cached after the first batch -# is processed, so we need to call generate once to calculate the prefix -# and cache it. -outputs = llm.generate(generating_prompts[0], sampling_params) +# if resources allow. +start_time_cached = time() +outputs = prefix_cached_llm.generate(generating_prompts, sampling_params) +duration_cached = time() - start_time_cached -# Subsequent batches can leverage the cached prefix -outputs = llm.generate(generating_prompts, sampling_params) +print("Results with `enable_prefix_caching`") -# Print the outputs. You should see the same outputs as before +cached_generated_texts = [] +# Print the outputs. You should see the same outputs as before. 
for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text + cached_generated_texts.append(generated_text) print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +print("-" * 80) + +# Compare the results and display the speedup +generated_same = all([ + regular_generated_texts[i] == cached_generated_texts[i] + for i in range(len(prompts)) +]) +print(f"Generated answers are the same: {generated_same}") + +speedup = round(duration_regular / duration_cached, 2) +print(f"Speed up of cached generation compared to the regular is: {speedup}") From 044793d8df6aeb5326b5992d0e60aa4457760e8a Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Sat, 1 Jun 2024 19:35:41 -0400 Subject: [PATCH 17/19] [BugFix] Prevent `LLM.encode` for non-generation Models (#5184) Co-authored-by: mgoin --- vllm/entrypoints/llm.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 6e971ae73f5d0..beee16d188eb5 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -276,6 +276,11 @@ def generate( considered legacy and may be deprecated in the future. You should instead pass them via the ``inputs`` parameter. """ + if self.llm_engine.model_config.embedding_mode: + raise ValueError( + "LLM.generate() is only supported for generation models " + "(XForCausalLM).") + if prompt_token_ids is not None or multi_modal_data is not None: inputs = self._convert_v1_inputs( prompts=cast(Optional[Union[str, List[str]]], prompts), @@ -420,6 +425,11 @@ def encode( considered legacy and may be deprecated in the future. You should instead pass them via the ``inputs`` parameter. """ + if not self.llm_engine.model_config.embedding_mode: + raise ValueError( + "LLM.encode() is only supported for embedding models (XModel)." + ) + if prompt_token_ids is not None or multi_modal_data is not None: inputs = self._convert_v1_inputs( prompts=cast(Optional[Union[str, List[str]]], prompts), From ed59a7ed23c6e91096ea82b03037e40b14b5375c Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Sat, 1 Jun 2024 21:21:53 -0500 Subject: [PATCH 18/19] Update test_ignore_eos (#4898) --- tests/samplers/test_ignore_eos.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index 864657a3c2b28..67b5168bea0e6 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -7,25 +7,26 @@ from vllm import SamplingParams -MODELS = ["facebook/opt-125m"] +# We also test with llama because it has generation_config to specify EOS +# (past regression). 
+MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"] @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [1024]) -def test_beam_search_single_input( +@pytest.mark.parametrize("max_tokens", [512]) +def test_ignore_eos( vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, ) -> None: - example_prompts = "1 + 1 is" - vllm_model = vllm_runner(model, dtype=dtype) sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) - ignore_eos_output = vllm_model.model.generate( - example_prompts, sampling_params=sampling_params) - print(len(ignore_eos_output[0].outputs[0].token_ids)) - assert max_tokens - len(ignore_eos_output[0].outputs[0].token_ids) < 10 - assert max_tokens - len(ignore_eos_output[0].outputs[0].token_ids) >= 0 + + for prompt in example_prompts: + ignore_eos_output = vllm_model.model.generate( + prompt, sampling_params=sampling_params) + output_length = len(ignore_eos_output[0].outputs[0].token_ids) + assert output_length == max_tokens From f790ad3c50f050778af1fd31170746b7c68ca2fc Mon Sep 17 00:00:00 2001 From: Avinash Raj Date: Sun, 2 Jun 2024 13:36:13 +0530 Subject: [PATCH 19/19] [Frontend][OpenAI] Support for returning max_model_len on /v1/models response (#4643) --- vllm/entrypoints/openai/protocol.py | 1 + vllm/entrypoints/openai/serving_engine.py | 1 + 2 files changed, 2 insertions(+) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index e380212a4d76b..bbd61a2c5dd59 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -82,6 +82,7 @@ class ModelCard(OpenAIBaseModel): owned_by: str = "vllm" root: Optional[str] = None parent: Optional[str] = None + max_model_len: Optional[int] = None permission: List[ModelPermission] = Field(default_factory=list) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 066acdf1c019a..ae659d19c878b 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -62,6 +62,7 @@ async def show_available_models(self) -> ModelList: """Show available models. Right now we only have one model.""" model_cards = [ ModelCard(id=served_model_name, + max_model_len=self.max_model_len, root=self.served_model_names[0], permission=[ModelPermission()]) for served_model_name in self.served_model_names