diff --git a/nemo/collections/vlm/__init__.py b/nemo/collections/vlm/__init__.py
index c333162e84b8..97b154085f4b 100644
--- a/nemo/collections/vlm/__init__.py
+++ b/nemo/collections/vlm/__init__.py
@@ -12,13 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# CLIP
 from nemo.collections.vlm.clip.data import ClipMockDataModule
 from nemo.collections.vlm.clip.model import CLIPConfigB32, CLIPConfigL14, CLIPModel
+
+# HF
 from nemo.collections.vlm.hf.data.hf_dataset import HFDatasetDataModule
 from nemo.collections.vlm.hf.model.hf_auto_model_for_image_text_to_text import HFAutoModelForImageTextToText
+
+# LLAVA_NEXT
 from nemo.collections.vlm.llava_next.data import LlavaNextMockDataModule, LlavaNextTaskEncoder
 from nemo.collections.vlm.llava_next.model.base import LlavaNextConfig
 from nemo.collections.vlm.llava_next.model.llava_next import LlavaNextConfig7B, LlavaNextConfig13B, LlavaNextModel
+
+# MLLAMA
 from nemo.collections.vlm.mllama.data import MLlamaLazyDataModule, MLlamaMockDataModule
 from nemo.collections.vlm.mllama.model.base import (
     CrossAttentionTextConfig,
@@ -32,6 +39,8 @@
     MLlamaConfig90B,
     MLlamaConfig90BInstruct,
 )
+
+# NEVA
 from nemo.collections.vlm.neva.data import (
     DataConfig,
     ImageDataConfig,
@@ -42,17 +51,27 @@
     VideoDataConfig,
     VideoToken,
 )
-from nemo.collections.vlm.neva.model.base import (
+from nemo.collections.vlm.neva.model.base import NevaConfig, NevaModel
+from nemo.collections.vlm.neva.model.llava import Llava15Config7B, Llava15Config13B, LlavaConfig, LlavaModel
+
+# PEFT
+from nemo.collections.vlm.peft import LoRA
+
+# RECIPES
+from nemo.collections.vlm.recipes import *
+
+# VISION
+from nemo.collections.vlm.vision import (
     CLIPViTConfig,
+    CLIPViTL_14_336_Config,
     HFCLIPVisionConfig,
+    InternViT_6B_448px_Config,
+    InternViT_300M_448px_Config,
+    InternViTModel,
     MultimodalProjectorConfig,
-    NevaConfig,
-    NevaModel,
+    SigLIPViT400M_14_384_Config,
+    SigLIPViTModel,
 )
-from nemo.collections.vlm.neva.model.llava import Llava15Config7B, Llava15Config13B, LlavaConfig, LlavaModel
-from nemo.collections.vlm.neva.model.vit_config import CLIPViTL_14_336_Config, SigLIPViT400M_14_384_Config
-from nemo.collections.vlm.peft import LoRA
-from nemo.collections.vlm.recipes import *
 
 __all__ = [
     "HFDatasetDataModule",
@@ -70,6 +89,7 @@
     "CLIPViTConfig",
     "HFCLIPVisionConfig",
     "CLIPViTL_14_336_Config",
+    "SigLIPViTModel",
     "SigLIPViT400M_14_384_Config",
     "MultimodalProjectorConfig",
     "NevaConfig",
@@ -95,7 +115,9 @@
     "LlavaNextConfig13B",
     "LlavaNextModel",
     "LlavaNextMockDataModule",
-    "LlavaNextTaskEncoder",
+    "InternViTModel",
+    "InternViT_300M_448px_Config",
+    "InternViT_6B_448px_Config",
     "CLIPModel",
     "LoRA",
     "CLIPConfigL14",
diff --git a/nemo/collections/vlm/llava_next/model/llava_next.py b/nemo/collections/vlm/llava_next/model/llava_next.py
index fac5d5dd0871..8621e1e3493e 100644
--- a/nemo/collections/vlm/llava_next/model/llava_next.py
+++ b/nemo/collections/vlm/llava_next/model/llava_next.py
@@ -26,8 +26,9 @@
 from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
 from nemo.collections.llm import Llama2Config7B, Llama2Config13B, LlamaConfig
 from nemo.collections.vlm.llava_next.model.base import LlavaNextConfig, MCoreLlavaNextModel
-from nemo.collections.vlm.neva.model.base import HFCLIPVisionConfig, MultimodalProjectorConfig, NevaModel
+from nemo.collections.vlm.neva.model.base import NevaModel
 from nemo.collections.vlm.neva.model.llava import HFLlavaImporter
+from nemo.collections.vlm.vision.base import HFCLIPVisionConfig, MultimodalProjectorConfig
 from nemo.lightning import OptimizerModule, io, teardown
 from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule
diff --git a/nemo/collections/vlm/neva/data/lazy.py b/nemo/collections/vlm/neva/data/lazy.py
index 90199d3c6d30..0076d3439270 100644
--- a/nemo/collections/vlm/neva/data/lazy.py
+++ b/nemo/collections/vlm/neva/data/lazy.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# pylint: disable=C0115,C0116
 
 import json
 import logging
@@ -355,7 +356,20 @@
                 return_tensors="pt",
             )[0]
             answer_start, answer_end = find_pattern_indices(tokens, answer_tokens, search_start_index)
-            assert answer_start > 0, "Not found valid answer in conversation."
+            if answer_start < 0:
+                logging.warning(
+                    "Unable to find a valid answer in the conversation. "
+                    "Details: "
+                    "\n- Messages: %s"
+                    "\n- Tokens: %s"
+                    "\n- Answer Tokens: %s"
+                    "\n- Search Start Index: %d",
+                    self.conv.messages,
+                    tokens,
+                    answer_tokens,
+                    search_start_index,
+                )
+                break
             labels[answer_start:answer_end] = tokens[answer_start:answer_end]
             search_start_index = answer_end
         tokens = tokens[:-1]
@@ -527,7 +541,7 @@
         self.init_global_step = 0
 
         if tokenizer is None or image_processor is None:
-            logging.warning(f"Processor and tokenizer are not provided! Fall back to `llava-hf/llava-1.5-7b-hf`.")
+            logging.warning("Processor and tokenizer are not provided! Fall back to `llava-hf/llava-1.5-7b-hf`.")
             from transformers import AutoProcessor
             from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
diff --git a/nemo/collections/vlm/neva/data/multimodal_tokens.py b/nemo/collections/vlm/neva/data/multimodal_tokens.py
index 8c4dcadad63c..fb25e9345225 100644
--- a/nemo/collections/vlm/neva/data/multimodal_tokens.py
+++ b/nemo/collections/vlm/neva/data/multimodal_tokens.py
@@ -31,6 +31,8 @@
 
 @dataclass
 class ImageToken(MultiModalToken):
+    """Image Token class"""
+
     token_str: str = "<image>"
     token_index: int = -200
     media_type: str = "image"
@@ -39,6 +41,8 @@
 
 @dataclass
 class VideoToken(MultiModalToken):
+    """Video Token class"""
+
     token_str: str = "<video>"