Refactor VLM modules / Add InternVit submodule support #11851

Status: Open. Wants to merge 37 commits into base: main.

Commits (37)
d9b7520
reformat
yaoyu-33 Jan 8, 2025
12023b9
fix import
yaoyu-33 Jan 8, 2025
6db7636
rename to base.py
yaoyu-33 Jan 8, 2025
fc8e6da
fix few issues from importing
yaoyu-33 Jan 9, 2025
2385eaa
temp save for intern vit
yaoyu-33 Jan 9, 2025
96fd7be
save for partially working internvit
yaoyu-33 Jan 9, 2025
04c6796
added support for importing clip vit
yashaswikarnati Jan 10, 2025
5bd3723
Move projector
yaoyu-33 Jan 10, 2025
c8f11fb
fix intern_vit
yaoyu-33 Jan 10, 2025
9249af5
fix intern_vit conversion
yaoyu-33 Jan 10, 2025
105e455
update intern vit
yaoyu-33 Jan 13, 2025
3a855bc
update tp logic
yaoyu-33 Jan 14, 2025
12b3521
Apply isort and black reformatting
yaoyu-33 Jan 14, 2025
4a4a91b
Merge branch 'main' into yuya/refactor_vlm_vision_module
yaoyu-33 Jan 15, 2025
bc52537
Merge branch 'main' into yuya/refactor_vlm_vision_module
yaoyu-33 Jan 21, 2025
309461f
update logging
yaoyu-33 Jan 21, 2025
a610a88
Apply isort and black reformatting
yaoyu-33 Jan 21, 2025
e4fe166
update init / fix unused layer bug
yaoyu-33 Jan 22, 2025
c33bf50
Apply isort and black reformatting
yaoyu-33 Jan 22, 2025
edd6aab
remove not used import
yaoyu-33 Jan 22, 2025
4fad475
Merge remote-tracking branch 'origin/yuya/refactor_vlm_vision_module'…
yaoyu-33 Jan 22, 2025
1c47a2d
Update for Siglip
yaoyu-33 Jan 22, 2025
25f1247
Apply isort and black reformatting
yaoyu-33 Jan 22, 2025
e7600e3
update init
yaoyu-33 Jan 22, 2025
01e4cd7
Merge remote-tracking branch 'origin/yuya/refactor_vlm_vision_module'…
yaoyu-33 Jan 22, 2025
46b2332
Apply isort and black reformatting
yaoyu-33 Jan 22, 2025
c364196
Fix logging
yaoyu-33 Jan 23, 2025
cfff3e0
Apply isort and black reformatting
yaoyu-33 Jan 23, 2025
3fd87d3
Merge branch 'main' into yuya/refactor_vlm_vision_module
yaoyu-33 Feb 4, 2025
8d057f4
Apply isort and black reformatting
yaoyu-33 Feb 4, 2025
31975d9
pylint issues
yaoyu-33 Feb 5, 2025
595a844
Apply isort and black reformatting
yaoyu-33 Feb 5, 2025
43ea83c
pylint issues
yaoyu-33 Feb 5, 2025
a7f416d
Merge remote-tracking branch 'origin/yuya/refactor_vlm_vision_module'…
yaoyu-33 Feb 5, 2025
0fbbd3e
Apply isort and black reformatting
yaoyu-33 Feb 5, 2025
232f7cb
fix pylint
yaoyu-33 Feb 5, 2025
a7754c2
Merge remote-tracking branch 'origin/yuya/refactor_vlm_vision_module'…
yaoyu-33 Feb 5, 2025
38 changes: 30 additions & 8 deletions nemo/collections/vlm/__init__.py
@@ -12,13 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# CLIP
from nemo.collections.vlm.clip.data import ClipMockDataModule
from nemo.collections.vlm.clip.model import ClipConfigB32, ClipConfigL14, CLIPModel

# HF
from nemo.collections.vlm.hf.data.hf_dataset import HFDatasetDataModule
from nemo.collections.vlm.hf.model.hf_auto_model_for_image_text_to_text import HFAutoModelForImageTextToText

# LLAVA_NEXT
from nemo.collections.vlm.llava_next.data import LlavaNextMockDataModule, LlavaNextTaskEncoder
from nemo.collections.vlm.llava_next.model.base import LlavaNextConfig
from nemo.collections.vlm.llava_next.model.llava_next import LlavaNextConfig7B, LlavaNextConfig13B, LlavaNextModel

# MLLAMA
from nemo.collections.vlm.mllama.data import MLlamaLazyDataModule, MLlamaMockDataModule
from nemo.collections.vlm.mllama.model.base import (
    CrossAttentionTextConfig,
@@ -32,6 +39,8 @@
    MLlamaConfig90B,
    MLlamaConfig90BInstruct,
)

# NEVA
from nemo.collections.vlm.neva.data import (
    DataConfig,
    ImageDataConfig,
@@ -42,17 +51,27 @@
    VideoDataConfig,
    VideoToken,
)
from nemo.collections.vlm.neva.model.base import (
from nemo.collections.vlm.neva.model.base import NevaConfig, NevaModel
from nemo.collections.vlm.neva.model.llava import Llava15Config7B, Llava15Config13B, LlavaConfig, LlavaModel

# PEFT
from nemo.collections.vlm.peft import LoRA

# RECIPES
from nemo.collections.vlm.recipes import *

# VISION
from nemo.collections.vlm.vision import (
    CLIPViTConfig,
    CLIPViTL_14_336_Config,
    HFCLIPVisionConfig,
    InternViT_6B_448px_Config,
    InternViT_300M_448px_Config,
    InternViTModel,
    MultimodalProjectorConfig,
    NevaConfig,
    NevaModel,
    SigLIPViT400M_14_384_Config,
    SigLIPViTModel,
)
from nemo.collections.vlm.neva.model.llava import Llava15Config7B, Llava15Config13B, LlavaConfig, LlavaModel
from nemo.collections.vlm.neva.model.vit_config import CLIPViTL_14_336_Config, SigLIPViT400M_14_384_Config
from nemo.collections.vlm.peft import LoRA
from nemo.collections.vlm.recipes import *

__all__ = [
"HFDatasetDataModule",
Expand All @@ -70,6 +89,7 @@
"CLIPViTConfig",
"HFCLIPVisionConfig",
"CLIPViTL_14_336_Config",
"SigLIPViTModel",
"SigLIPViT400M_14_384_Config",
"MultimodalProjectorConfig",
"NevaConfig",
@@ -95,7 +115,9 @@
    "LlavaNextConfig13B",
    "LlavaNextModel",
    "LlavaNextMockDataModule",
    "LlavaNextTaskEncoder",
    "InternViTModel",
    "InternViT_300M_448px_Config",
    "InternViT_6B_448px_Config",
    "CLIPModel",
    "LoRA",
    "ClipConfigL14",
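Note on the nemo/collections/vlm/__init__.py changes above: the vision tower and projector configs move out of nemo.collections.vlm.neva.model into the new nemo.collections.vlm.vision submodule, while the top-level package keeps re-exporting them. A minimal sketch of the updated import paths, using only names visible in this diff (constructor signatures are not part of this hunk, so nothing is instantiated):

# Illustrative sketch: new import locations after this refactor.
# New home of the vision towers and the projector config:
from nemo.collections.vlm.vision import (
    InternViT_300M_448px_Config,
    InternViTModel,
    MultimodalProjectorConfig,
)

# The top-level package re-exports the same names, so this is equivalent for users:
from nemo.collections.vlm import CLIPViTL_14_336_Config, SigLIPViT400M_14_384_Config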
3 changes: 2 additions & 1 deletion nemo/collections/vlm/llava_next/model/llava_next.py
@@ -26,8 +26,9 @@
from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
from nemo.collections.llm import Llama2Config7B, Llama2Config13B, LlamaConfig
from nemo.collections.vlm.llava_next.model.base import LlavaNextConfig, MCoreLlavaNextModel
from nemo.collections.vlm.neva.model.base import HFCLIPVisionConfig, MultimodalProjectorConfig, NevaModel
from nemo.collections.vlm.neva.model.base import NevaModel
from nemo.collections.vlm.neva.model.llava import HFLlavaImporter
from nemo.collections.vlm.vision.base import HFCLIPVisionConfig, MultimodalProjectorConfig
from nemo.lightning import OptimizerModule, io, teardown
from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule

18 changes: 16 additions & 2 deletions nemo/collections/vlm/neva/data/lazy.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=C0115,C0116

import json
import logging
@@ -355,7 +356,20 @@ def _tokenize_and_label(self, conversations):
return_tensors="pt",
)[0]
answer_start, answer_end = find_pattern_indices(tokens, answer_tokens, search_start_index)
assert answer_start > 0, "Not found valid answer in conversation."
if answer_start < 0:
logging.warning(
"Unable to find a valid answer in the conversation. "
"Details: "
"\n- Messages: %s"
"\n- Tokens: %s"
"\n- Answer Tokens: %s"
"\n- Search Start Index: %d",
self.conv.messages,
tokens,
answer_tokens,
search_start_index,
)
break
labels[answer_start:answer_end] = tokens[answer_start:answer_end]
search_start_index = answer_end
tokens = tokens[:-1]
@@ -527,7 +541,7 @@ def __init__(
        self.init_global_step = 0

        if tokenizer is None or image_processor is None:
            logging.warning(f"Processor and tokenizer are not provided! Fall back to `llava-hf/llava-1.5-7b-hf`.")
            logging.warning("Processor and tokenizer are not provided! Fall back to `llava-hf/llava-1.5-7b-hf`.")
            from transformers import AutoProcessor
            from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer

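Context for the lazy.py hunk above: the hard assert on answer_start becomes a warning followed by break, so a conversation whose answer tokens cannot be located no longer aborts data loading. The helper below is a hypothetical stand-in (not the NeMo implementation of find_pattern_indices) illustrating the assumed contract that a negative start index means the answer span was not found:

# Hypothetical sketch of the assumed contract of find_pattern_indices:
# return (start, end) of `pattern` inside `tokens` at or after `search_start`,
# or (-1, -1) when there is no match -- the case the new warning-and-break handles.
import torch


def find_answer_span(tokens: torch.Tensor, pattern: torch.Tensor, search_start: int = 0):
    n, m = tokens.numel(), pattern.numel()
    for start in range(search_start, n - m + 1):
        if torch.equal(tokens[start:start + m], pattern):
            return start, start + m
    return -1, -1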
8 changes: 7 additions & 1 deletion nemo/collections/vlm/neva/data/multimodal_tokens.py
@@ -31,6 +31,8 @@ class MultiModalToken:

@dataclass
class ImageToken(MultiModalToken):
"""Image Token class"""

token_str: str = "<image>"
token_index: int = -200
media_type: str = "image"
@@ -39,6 +41,8 @@ class VideoToken(MultiModalToken):

@dataclass
class VideoToken(MultiModalToken):
"""Video Token class"""

token_str: str = "<video>"
token_index: int = -300
media_type: str = "video"
@@ -49,4 +53,6 @@
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = ImageToken.token_index
VIDEO_TOKEN_INDEX = VideoToken.token_index
SPECIAL_TOKEN_MAP = [(ImageToken.token_str, ImageToken.token_index), (VideoToken.token_str, VideoToken.token_index)]
SPECIAL_TOKEN_MAP = [
    (ImageToken.token_str, ImageToken.token_index),
]
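Side note on multimodal_tokens.py: SPECIAL_TOKEN_MAP now lists only the image token. The sketch below (not code from this PR; `encode` is a hypothetical stand-in for a tokenizer call) shows how such a map of token strings to negative placeholder indices is typically spliced into a tokenized prompt:

from typing import Callable, List

SPECIAL_TOKEN_MAP = [("<image>", -200)]  # mirrors the value after this change


def encode_with_media_tokens(text: str, encode: Callable[[str], List[int]]) -> List[int]:
    """Encode `text`, replacing each "<image>" occurrence with its placeholder index."""
    token_str, token_index = SPECIAL_TOKEN_MAP[0]
    ids: List[int] = []
    for i, chunk in enumerate(text.split(token_str)):
        if i > 0:
            ids.append(token_index)  # negative index marks where image embeddings are inserted
        if chunk:
            ids.extend(encode(chunk))
    return ids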
14 changes: 1 addition & 13 deletions nemo/collections/vlm/neva/model/__init__.py
@@ -12,22 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo.collections.vlm.neva.model.base import (
    CLIPViTConfig,
    HFCLIPVisionConfig,
    MultimodalProjectorConfig,
    NevaConfig,
    NevaModel,
)
from nemo.collections.vlm.neva.model.base import NevaConfig, NevaModel
from nemo.collections.vlm.neva.model.llava import Llava15Config7B, Llava15Config13B, LlavaConfig, LlavaModel
from nemo.collections.vlm.neva.model.vit_config import CLIPViTL_14_336_Config, SigLIPViT400M_14_384_Config

__all__ = [
"CLIPViTConfig",
"CLIPViTL_14_336_Config",
"SigLIPViT400M_14_384_Config",
"HFCLIPVisionConfig",
"MultimodalProjectorConfig",
"NevaConfig",
"NevaModel",
"LlavaConfig",