diff --git a/nemo/collections/vlm/__init__.py b/nemo/collections/vlm/__init__.py
index c333162e84b8..97b154085f4b 100644
--- a/nemo/collections/vlm/__init__.py
+++ b/nemo/collections/vlm/__init__.py
@@ -12,13 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# CLIP
 from nemo.collections.vlm.clip.data import ClipMockDataModule
 from nemo.collections.vlm.clip.model import CLIPConfigB32, CLIPConfigL14, CLIPModel
+
+# HF
 from nemo.collections.vlm.hf.data.hf_dataset import HFDatasetDataModule
 from nemo.collections.vlm.hf.model.hf_auto_model_for_image_text_to_text import HFAutoModelForImageTextToText
+
+# LLAVA_NEXT
 from nemo.collections.vlm.llava_next.data import LlavaNextMockDataModule, LlavaNextTaskEncoder
 from nemo.collections.vlm.llava_next.model.base import LlavaNextConfig
 from nemo.collections.vlm.llava_next.model.llava_next import LlavaNextConfig7B, LlavaNextConfig13B, LlavaNextModel
+
+# MLLAMA
 from nemo.collections.vlm.mllama.data import MLlamaLazyDataModule, MLlamaMockDataModule
 from nemo.collections.vlm.mllama.model.base import (
     CrossAttentionTextConfig,
@@ -32,6 +39,8 @@
     MLlamaConfig90B,
     MLlamaConfig90BInstruct,
 )
+
+# NEVA
 from nemo.collections.vlm.neva.data import (
     DataConfig,
     ImageDataConfig,
@@ -42,17 +51,27 @@
     VideoDataConfig,
     VideoToken,
 )
-from nemo.collections.vlm.neva.model.base import (
+from nemo.collections.vlm.neva.model.base import NevaConfig, NevaModel
+from nemo.collections.vlm.neva.model.llava import Llava15Config7B, Llava15Config13B, LlavaConfig, LlavaModel
+
+# PEFT
+from nemo.collections.vlm.peft import LoRA
+
+# RECIPES
+from nemo.collections.vlm.recipes import *
+
+# VISION
+from nemo.collections.vlm.vision import (
     CLIPViTConfig,
+    CLIPViTL_14_336_Config,
     HFCLIPVisionConfig,
+    InternViT_6B_448px_Config,
+    InternViT_300M_448px_Config,
+    InternViTModel,
     MultimodalProjectorConfig,
-    NevaConfig,
-    NevaModel,
+    SigLIPViT400M_14_384_Config,
+    SigLIPViTModel,
 )
-from nemo.collections.vlm.neva.model.llava import Llava15Config7B, Llava15Config13B, LlavaConfig, LlavaModel
-from nemo.collections.vlm.neva.model.vit_config import CLIPViTL_14_336_Config, SigLIPViT400M_14_384_Config
-from nemo.collections.vlm.peft import LoRA
-from nemo.collections.vlm.recipes import *
 
 __all__ = [
     "HFDatasetDataModule",
@@ -70,6 +89,7 @@
     "CLIPViTConfig",
     "HFCLIPVisionConfig",
     "CLIPViTL_14_336_Config",
+    "SigLIPViTModel",
     "SigLIPViT400M_14_384_Config",
     "MultimodalProjectorConfig",
     "NevaConfig",
@@ -95,7 +115,9 @@
     "LlavaNextConfig13B",
     "LlavaNextModel",
     "LlavaNextMockDataModule",
-    "LlavaNextTaskEncoder",
+    "InternViTModel",
+    "InternViT_300M_448px_Config",
+    "InternViT_6B_448px_Config",
     "CLIPModel",
     "LoRA",
     "CLIPConfigL14",
diff --git a/nemo/collections/vlm/llava_next/model/llava_next.py b/nemo/collections/vlm/llava_next/model/llava_next.py
index fac5d5dd0871..8621e1e3493e 100644
--- a/nemo/collections/vlm/llava_next/model/llava_next.py
+++ b/nemo/collections/vlm/llava_next/model/llava_next.py
@@ -26,8 +26,9 @@
 from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
 from nemo.collections.llm import Llama2Config7B, Llama2Config13B, LlamaConfig
 from nemo.collections.vlm.llava_next.model.base import LlavaNextConfig, MCoreLlavaNextModel
-from nemo.collections.vlm.neva.model.base import HFCLIPVisionConfig, MultimodalProjectorConfig, NevaModel
+from nemo.collections.vlm.neva.model.base import NevaModel
 from nemo.collections.vlm.neva.model.llava import HFLlavaImporter
+from nemo.collections.vlm.vision.base import HFCLIPVisionConfig, MultimodalProjectorConfig
 from nemo.lightning import OptimizerModule, io, teardown
 from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule
diff --git a/nemo/collections/vlm/neva/data/lazy.py b/nemo/collections/vlm/neva/data/lazy.py
index 90199d3c6d30..0076d3439270 100644
--- a/nemo/collections/vlm/neva/data/lazy.py
+++ b/nemo/collections/vlm/neva/data/lazy.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# pylint: disable=C0115,C0116
 
 import json
 import logging
@@ -355,7 +356,20 @@
                 return_tensors="pt",
             )[0]
             answer_start, answer_end = find_pattern_indices(tokens, answer_tokens, search_start_index)
-            assert answer_start > 0, "Not found valid answer in conversation."
+            if answer_start < 0:
+                logging.warning(
+                    "Unable to find a valid answer in the conversation. "
+                    "Details: "
+                    "\n- Messages: %s"
+                    "\n- Tokens: %s"
+                    "\n- Answer Tokens: %s"
+                    "\n- Search Start Index: %d",
+                    self.conv.messages,
+                    tokens,
+                    answer_tokens,
+                    search_start_index,
+                )
+                break
             labels[answer_start:answer_end] = tokens[answer_start:answer_end]
             search_start_index = answer_end
         tokens = tokens[:-1]
@@ -527,7 +541,7 @@
         self.init_global_step = 0
 
         if tokenizer is None or image_processor is None:
-            logging.warning(f"Processor and tokenizer are not provided! Fall back to `llava-hf/llava-1.5-7b-hf`.")
+            logging.warning("Processor and tokenizer are not provided! Fall back to `llava-hf/llava-1.5-7b-hf`.")
             from transformers import AutoProcessor
             from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
diff --git a/nemo/collections/vlm/neva/data/multimodal_tokens.py b/nemo/collections/vlm/neva/data/multimodal_tokens.py
index 8c4dcadad63c..fb25e9345225 100644
--- a/nemo/collections/vlm/neva/data/multimodal_tokens.py
+++ b/nemo/collections/vlm/neva/data/multimodal_tokens.py
@@ -31,6 +31,8 @@
 
 @dataclass
 class ImageToken(MultiModalToken):
+    """Image Token class"""
+
     token_str: str = "<image>"
     token_index: int = -200
     media_type: str = "image"
@@ -39,6 +41,8 @@
 
 @dataclass
 class VideoToken(MultiModalToken):
+    """Video Token class"""
+
     token_str: str = "<video>"