Skip to content

Commit

Permalink
Limit number of dummy cross attention blocks (#667)
Browse files Browse the repository at this point in the history
Fix warmup for the encoder-decoder model runner by limiting the number of
dummy cross-attention blocks to the number of available blocks. Without
this, CrossAttention raises an error due to a lack of available blocks.
  • Loading branch information
kdamaszk authored Jan 8, 2025
1 parent cccf363 commit fa9dbf2
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion vllm/worker/hpu_enc_dec_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,6 @@ def create_dummy_seq_group_metadata(self,
num_images = mm_counts["image"]
max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
self.model_config) * num_images
num_cross_blocks = math.ceil(max_mm_tokens / self.block_size)
seq_len = max(seq_len, 1)
if is_prompt:
input_len = seq_len
Expand All @@ -437,6 +436,9 @@ def create_dummy_seq_group_metadata(self,
input_len = seq_len - 1
output_len = 1
block_tables = {group_id: [_PAD_BLOCK_ID] * num_blocks}
# limit cross blocks to the number of available blocks
num_cross_blocks = min(self.bucketing_ctx.num_hpu_blocks,
max_mm_tokens) // self.block_size
cross_block_table = [_PAD_BLOCK_ID] * num_cross_blocks
prompt_token_ids = [0] * input_len
output_token_ids = [1] * output_len
Expand Down

0 comments on commit fa9dbf2

Please sign in to comment.