Skip to content

Commit

Permalink
Limit number of dummy cross attention blocks (#667)
Browse files Browse the repository at this point in the history
Fix warmup for the encoder-decoder model runner by limiting the number of
dummy cross-attention blocks to the number of available blocks. Without
this, CrossAttention raises an error due to a lack of available blocks.
  • Loading branch information
kdamaszk authored Jan 8, 2025
1 parent cccf363 commit fa9dbf2
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion vllm/worker/hpu_enc_dec_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,6 @@ def create_dummy_seq_group_metadata(self,
num_images = mm_counts["image"]
max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
self.model_config) * num_images
num_cross_blocks = math.ceil(max_mm_tokens / self.block_size)
seq_len = max(seq_len, 1)
if is_prompt:
input_len = seq_len
Expand All @@ -437,6 +436,9 @@ def create_dummy_seq_group_metadata(self,
input_len = seq_len - 1
output_len = 1
block_tables = {group_id: [_PAD_BLOCK_ID] * num_blocks}
# limit cross blocks to the number of available blocks
num_cross_blocks = min(self.bucketing_ctx.num_hpu_blocks,
max_mm_tokens) // self.block_size
cross_block_table = [_PAD_BLOCK_ID] * num_cross_blocks
prompt_token_ids = [0] * input_len
output_token_ids = [1] * output_len
Expand Down

0 comments on commit fa9dbf2

Please sign in to comment.