
[Bugfix] Fix crash with llama 3.2 vision models and guided decoding (#9631)

Signed-off-by: Travis Johnson <[email protected]>
Co-authored-by: pavlo-ruban <[email protected]>
Co-authored-by: Nick Hill <[email protected]>
3 people authored Oct 25, 2024
1 parent 228cfbd commit 6567e13
Showing 1 changed file with 11 additions and 3 deletions.
14 changes: 11 additions & 3 deletions vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -15,11 +15,11 @@
 # limitations under the License.
 import copy
 import json
-import math
 from collections import defaultdict
 from functools import lru_cache
 from typing import Callable, DefaultDict, Dict, List, Union
 
+import numpy as np
 import torch
 from lark import Lark
 from outlines import grammars
@@ -77,9 +77,17 @@ def __call__(self, input_ids: List[int],
                 f"Unsupported instruction type {type(instruction)}")
 
         mask = torch.full((scores.shape[-1], ),
-                          -math.inf,
+                          -torch.inf,
                           device=scores.device)
-        mask[allowed_tokens] = 0
+        # The tokenizer may support more token ids than the model can generate,
+        # eg. Llama 3.2 Vision models have an `<|image|>` token with id 128256
+        # but scores.shape == torch.Size([128256])
+        # Using NumPy is faster for filtering token ids
+        allowed_tokens = np.array(allowed_tokens, dtype=np.int64)
+        allowed_tokens = torch.tensor(allowed_tokens, device=scores.device)
+        allowed_tokens = allowed_tokens.masked_select(
+            allowed_tokens < scores.shape[-1])
+        mask.index_fill_(0, allowed_tokens, 0)
         scores.add_(mask)
         return scores
 

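For context, here is a minimal standalone sketch (not part of the commit) of the failure mode being fixed, using hypothetical allowed-token ids. When the allowed-token list contains an id equal to the vocabulary size covered by the logits, indexing the mask with the raw list raises an IndexError on CPU (or a device-side assert on CUDA); filtering the ids against scores.shape[-1] first, as the diff does, avoids the crash.

    import numpy as np
    import torch

    vocab_size = 128256                  # scores.shape[-1] per the comment in the diff
    scores = torch.zeros(vocab_size)
    allowed_tokens = [128000, 128256]    # hypothetical list; 128256 is the <|image|> id

    mask = torch.full((scores.shape[-1], ), -torch.inf, device=scores.device)

    # Old behaviour (crashes):
    #   mask[allowed_tokens] = 0
    #   -> IndexError: index 128256 is out of bounds for dimension 0 with size 128256

    # New behaviour: drop ids the model cannot emit, then fill the mask.
    allowed = torch.tensor(np.array(allowed_tokens, dtype=np.int64),
                           device=scores.device)
    allowed = allowed.masked_select(allowed < scores.shape[-1])
    mask.index_fill_(0, allowed, 0)
    scores.add_(mask)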
