From dbb0094fd0cb936469e35320bf37e866ef7a1da4 Mon Sep 17 00:00:00 2001 From: DawerG Date: Wed, 1 Nov 2023 19:12:19 +0530 Subject: [PATCH] 7.1 release (#2027) CI: https://gitlab.com/coremltools1/coremltools/-/pipelines/1056684725 --- CMakeLists.txt | 12 + coremltools/_deps/__init__.py | 21 +- coremltools/converters/_converters_entry.py | 173 ++++-- .../converters/mil/backend/mil/load.py | 50 +- coremltools/converters/mil/converter.py | 7 +- .../converters/mil/frontend/milproto/load.py | 40 +- .../mil/frontend/tensorflow/test/test_ops.py | 27 + .../tensorflow2/test/test_v2_ops_tf_keras.py | 5 + .../mil/frontend/torch/converter.py | 410 +++++--------- .../mil/frontend/torch/dialect_ops.py | 55 +- .../mil/frontend/torch/edgeir_utils.py | 34 ++ .../mil/frontend/torch/internal_graph.py | 412 +++++++++----- .../converters/mil/frontend/torch/load.py | 73 +-- .../converters/mil/frontend/torch/ops.py | 491 +++++++++++------ .../mil/frontend/torch/quantization_ops.py | 48 +- .../ssa_passes/torch_tensor_assign_to_core.py | 18 +- .../mil/frontend/torch/test/test_api.py | 72 --- .../frontend/torch/test/test_custom_ops.py | 8 +- .../torch/test/test_executorch_e2e.py | 158 ++++++ .../torch/test/test_internal_graph.py | 4 +- .../torch/test/test_torch_conversion_api.py | 249 ++++++++- .../mil/frontend/torch/test/test_torch_ops.py | 348 +++++++++--- .../torch/test/test_torch_quantization_ops.py | 55 +- .../mil/frontend/torch/test/testing_utils.py | 64 ++- .../mil/frontend/torch/torch_op_registry.py | 115 +++- .../mil/frontend/torch/torchir_passes.py | 25 +- .../mil/frontend/torch/torchscript_utils.py | 201 +++++++ coremltools/converters/mil/input_types.py | 1 + coremltools/converters/mil/mil/__init__.py | 31 +- coremltools/converters/mil/mil/block.py | 31 +- coremltools/converters/mil/mil/builder.py | 112 ++-- coremltools/converters/mil/mil/input_type.py | 23 +- coremltools/converters/mil/mil/operation.py | 12 +- .../converters/mil/mil/ops/defs/_utils.py | 154 +++++- .../mil/ops/defs/iOS15/elementwise_unary.py | 2 +- .../mil/mil/ops/defs/iOS15/linear.py | 4 +- .../converters/mil/mil/ops/defs/iOS15/pool.py | 30 +- .../mil/mil/ops/defs/iOS15/random.py | 54 +- .../mil/ops/defs/iOS15/tensor_operation.py | 4 + .../ops/defs/iOS15/tensor_transformation.py | 76 +-- .../mil/mil/ops/defs/iOS16/scatter_gather.py | 81 ++- .../mil/mil/ops/defs/iOS17/scatter_gather.py | 73 ++- .../converters/mil/mil/ops/registry.py | 5 +- .../mil/mil/ops/tests/iOS14/test_conv.py | 38 ++ .../mil/mil/ops/tests/iOS14/test_linear.py | 36 +- .../mil/mil/ops/tests/iOS14/test_pool.py | 38 +- .../ops/tests/iOS14/test_scatter_gather.py | 71 ++- .../ops/tests/iOS14/test_tensor_operation.py | 11 + .../tests/iOS14/test_tensor_transformation.py | 40 +- .../mil/mil/ops/tests/iOS16/test_conv.py | 9 +- .../ops/tests/iOS16/test_scatter_gather.py | 102 +++- .../mil/mil/ops/tests/iOS17/test_linear.py | 24 + .../ops/tests/iOS17/test_scatter_gather.py | 55 +- .../mil/passes/defs/optimize_quantization.py | 130 +++++ .../passes/defs/optimize_tensor_operation.py | 77 +++ .../mil/mil/passes/defs/quantization.py | 324 +++++++---- .../mil/mil/passes/pass_pipeline.py | 10 + .../mil/passes/tests/test_pass_pipeline.py | 6 + .../mil/mil/passes/tests/test_passes.py | 85 ++- .../passes/tests/test_quantization_passes.py | 510 +++++++++++++++++- coremltools/converters/mil/mil/program.py | 121 ++++- .../converters/mil/mil/tests/test_block.py | 1 - .../converters/mil/mil/tests/test_programs.py | 155 +++++- coremltools/converters/mil/testing_utils.py | 7 + 
coremltools/models/__init__.py | 1 + coremltools/models/model.py | 2 +- coremltools/optimize/coreml/_config.py | 5 +- .../optimize/coreml/_quantization_passes.py | 205 ++++--- .../torch/pruning/magnitude_pruner.py | 2 +- .../test/ml_program/test_compression.py | 39 +- .../neural_network/test_numpy_nn_layers.py | 1 + .../test/neural_network/test_tf_numeric.py | 5 + .../coreml/test_post_training_quantization.py | 9 +- coremltools/version.py | 2 +- reqs/test.pip | 6 +- scripts/build.sh | 2 +- 76 files changed, 4511 insertions(+), 1486 deletions(-) create mode 100644 coremltools/converters/mil/frontend/torch/edgeir_utils.py delete mode 100644 coremltools/converters/mil/frontend/torch/test/test_api.py create mode 100644 coremltools/converters/mil/frontend/torch/test/test_executorch_e2e.py create mode 100644 coremltools/converters/mil/frontend/torch/torchscript_utils.py diff --git a/CMakeLists.txt b/CMakeLists.txt index e64d03104..d4625252a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -191,6 +191,7 @@ else() message(STATUS "CoreML.framework and dependent frameworks not found. Skipping libcoremlpython build.") endif() + # Build kmeans-1d set(KMEANS_DIR "${PROJECT_SOURCE_DIR}/deps/kmeans1d") execute_process( @@ -198,12 +199,23 @@ execute_process( WORKING_DIRECTORY ${KMEANS_DIR} ) +# Somehow Python's setuptools is building this shared object file so that it tries to load the C++ +# standard library using an rpath that only exist on the build machine. Change that so it gets +# loaded from the standard location. +if(APPLE) + file(GLOB SO_FILE "${PROJECT_SOURCE_DIR}/deps/kmeans1d/kmeans1d/_core.*.so") + execute_process( + COMMAND install_name_tool -change @rpath/libc++.1.dylib /usr/lib/libc++.1.dylib ${SO_FILE} + ) +endif() + # Copy kmeans-1d to Python deps folder execute_process( COMMAND cp -r kmeans1d ../../coremltools/_deps WORKING_DIRECTORY ${KMEANS_DIR} ) + set(PYTHON_TAG "cp${PYTHON_VERSION_MAJOR}${PYTHON_VERSION_MINOR}") if(APPLE) execute_process(COMMAND uname -m OUTPUT_VARIABLE HARDWARE_NAME OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/coremltools/_deps/__init__.py b/coremltools/_deps/__init__.py index 01e1f3eb4..4d352840f 100644 --- a/coremltools/_deps/__init__.py +++ b/coremltools/_deps/__init__.py @@ -154,10 +154,15 @@ def __get_sklearn_version(version): # --------------------------------------------------------------------------------------- _HAS_TORCH = True -_TORCH_MAX_VERSION = "2.0.0" +_TORCH_MAX_VERSION = "2.1.0" +_HAS_TORCH_EXPORT_API = False try: import torch _warn_if_above_max_supported_version("Torch", torch.__version__, _TORCH_MAX_VERSION) + + if _get_version(torch.__version__) >= _StrictVersion("2.1.0"): + _HAS_TORCH_EXPORT_API = True + except: _HAS_TORCH = False MSG_TORCH_NOT_FOUND = "PyTorch not found." @@ -170,6 +175,20 @@ def __get_sklearn_version(version): _HAS_TORCH_VISION = False MSG_TORCH_VISION_NOT_FOUND = "TorchVision not found." +_HAS_TORCH_AUDIO = True +try: + import torchaudio +except: + _HAS_TORCH_AUDIO = False +MSG_TORCH_AUDIO_NOT_FOUND = "TorchAudio not found." + + +_HAS_EXECUTORCH = True +try: + import executorch +except: + _HAS_EXECUTORCH = False +MSG_EXECUTORCH_NOT_FOUND = "Executorch not found." 
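The new availability flags above follow the same guarded-import pattern as the existing `_HAS_TORCH` / `_HAS_TORCH_VISION` checks. A minimal sketch of how they are meant to be consumed downstream; the helper names here are hypothetical, only the imported flags and message come from this patch:

from coremltools._deps import (
    _HAS_EXECUTORCH,
    _HAS_TORCH_EXPORT_API,
    MSG_EXECUTORCH_NOT_FOUND,
)

def _is_exported_program(model) -> bool:
    # torch.export exists only for torch >= 2.1, so the import itself is gated on the flag.
    if _HAS_TORCH_EXPORT_API:
        from torch.export import ExportedProgram
        return isinstance(model, ExportedProgram)
    return False

def _require_executorch() -> None:
    # ExecuTorch-specific code paths fail early with a readable message instead of an ImportError.
    if not _HAS_EXECUTORCH:
        raise RuntimeError(MSG_EXECUTORCH_NOT_FOUND)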
# --------------------------------------------------------------------------------------- try: diff --git a/coremltools/converters/_converters_entry.py b/coremltools/converters/_converters_entry.py index d808c03da..b00fa3b99 100644 --- a/coremltools/converters/_converters_entry.py +++ b/coremltools/converters/_converters_entry.py @@ -15,7 +15,7 @@ from coremltools import ComputeUnit as _ComputeUnit from coremltools import __version__ as _ct_version from coremltools import _logger as logger -from coremltools._deps import _HAS_TF_1, _HAS_TF_2, _HAS_TORCH +from coremltools._deps import _HAS_TF_1, _HAS_TF_2, _HAS_TORCH, _HAS_TORCH_EXPORT_API from coremltools.converters._profile_utils import _profile from coremltools.converters.mil._deployment_compatibility import ( AvailableTarget, @@ -36,7 +36,7 @@ from coremltools.converters.mil.mil.passes.defs.quantization import FP16ComputePrecision from coremltools.converters.mil.mil.passes.graph_pass import PassOption as _PassOption from coremltools.converters.mil.mil.passes.pass_pipeline import PassPipeline -from coremltools.models import _METADATA_SOURCE, _METADATA_VERSION +from coremltools.models import _METADATA_SOURCE, _METADATA_SOURCE_DIALECT, _METADATA_VERSION from coremltools.models.utils import _MLPACKAGE_EXTENSION if _HAS_TF_1: @@ -51,8 +51,13 @@ if _HAS_TORCH: import torch - from coremltools.converters.mil.frontend.torch.load import \ - _torchscript_from_model as pytorch_load + from coremltools.converters.mil.frontend.torch.load import ( + _torchscript_from_spec as try_load_torchscript, + ) + + if _HAS_TORCH_EXPORT_API: + from torch.export import ExportedProgram + @_profile @@ -102,8 +107,12 @@ def convert( * PyTorch - - A `TorchScript `_ object - - Path to a ``.pt`` file + - TorchScript Models: + - A `TorchScript `_ object + - Path to a ``.pt`` file + + - Torch Exported Models: + - A `ExportedProgram ` object with `EDGE` dialect source : str (optional) @@ -161,18 +170,23 @@ def convert( When ``inputs`` not provided or ``dtype`` not specified. The float 32 inputs defaults to float 16. * PyTorch: - - The ``inputs`` parameter is required. - - Number of elements in ``inputs`` must match the number of inputs - of the PyTorch model. - - ``inputs`` may be a nested list or tuple. - - ``TensorType`` and ``ImageType`` must have the ``shape`` specified. - - If the ``name`` argument is specified with ``TensorType`` or - ``ImageType``, the converted Core ML model will have inputs with - the same name. - - If ``dtype`` is missing: - * For ``minimum_deployment_target <= ct.target.macOS12``, it defaults to float 32. - * For ``minimum_deployment_target >= ct.target.macOS13``, and with ``compute_precision`` in float 16 precision. - It defaults to float 16. + + - TorchScript Models: + - The ``inputs`` parameter is required. + - Number of elements in ``inputs`` must match the number of inputs + of the PyTorch model. + - ``inputs`` may be a nested list or tuple. + - ``TensorType`` and ``ImageType`` must have the ``shape`` specified. + - If the ``name`` argument is specified with ``TensorType`` or + ``ImageType``, the converted Core ML model will have inputs with + the same name. + - If ``dtype`` is missing: + * For ``minimum_deployment_target <= ct.target.macOS12``, it defaults to float 32. + * For ``minimum_deployment_target >= ct.target.macOS13``, and with ``compute_precision`` in float 16 precision. + It defaults to float 16. + + - Torch Exported Models: + - The ``inputs`` parameter is not supported. ``inputs`` parameter is inferred from Torch ExportedProgram. 
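              A hedged usage sketch for the exported-program path (``edge_program`` below is
              assumed to be an ``ExportedProgram`` already lowered to the ``EDGE`` dialect,
              for example through the ExecuTorch export flow; producing it is outside the
              scope of this patch):

              >>> mlmodel = ct.convert(edge_program)  # no ``inputs``/``outputs`` for ExportedProgram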
outputs : list of ``TensorType`` or ``ImageType`` (optional) @@ -218,13 +232,17 @@ def convert( * PyTorch: - - If specified, the length of the list must match the number of - outputs returned by the PyTorch model. - - If ``name`` is specified, it is applied to the output names of the - converted Core ML model. - - For ``minimum_deployment_target >= ct.target.macOS13``, and with ``compute_precision`` in float 16 precision. - If ``dtype`` not specified, the outputs inferred of type float 32 - defaults to float 16. + - TorchScript Models: + - If specified, the length of the list must match the number of + outputs returned by the PyTorch model. + - If ``name`` is specified, it is applied to the output names of the + converted Core ML model. + - For ``minimum_deployment_target >= ct.target.macOS13``, and with ``compute_precision`` in float 16 precision. + If ``dtype`` not specified, the outputs inferred of type float 32 + defaults to float 16. + + - Torch Exported Models: + - The ``outputs`` parameter is not supported. ``outputs`` parameter is inferred from Torch ExportedProgram. classifier_config : ClassifierConfig class (optional) @@ -308,7 +326,7 @@ def convert( The above transform iterates through all the ops, looking at each op's inputs and outputs. If they are of type float 32, ``cast`` ops are injected to convert those tensors (also known as `vars`) to - type float 16. + type float 16. Similarly, int32 vars will also be cast to int16. - ``coremltools.precision.FLOAT32`` enum: No transform is applied. @@ -489,15 +507,17 @@ def skip_real_div_ops(op): PyTorch: - >>> model = torchvision.models.mobilenet_v2() - >>> model.eval() - >>> example_input = torch.rand(1, 3, 256, 256) - >>> traced_model = torch.jit.trace(model, example_input) + TorchScript Models: - >>> input = ct.TensorType(name='input_name', shape=(1, 3, 256, 256)) - >>> mlmodel = ct.convert(traced_model, inputs=[input]) - >>> results = mlmodel.predict({"input": example_input.numpy()}) - >>> print(results['1651']) # 1651 is the node name given by PyTorch's JIT + >>> model = torchvision.models.mobilenet_v2() + >>> model.eval() + >>> example_input = torch.rand(1, 3, 256, 256) + >>> traced_model = torch.jit.trace(model, example_input) + + >>> input = ct.TensorType(name='input_name', shape=(1, 3, 256, 256)) + >>> mlmodel = ct.convert(traced_model, inputs=[input]) + >>> results = mlmodel.predict({"input": example_input.numpy()}) + >>> print(results['1651']) # 1651 is the node name given by PyTorch's JIT See `Conversion Options `_ for more advanced options. @@ -508,6 +528,7 @@ def skip_real_div_ops(op): outputs_as_strings, outputs_as_tensor_or_image_types, outputs) + source_dialect = _determine_source_dialect(model, exact_source) exact_target = _determine_target(convert_to, minimum_deployment_target) _validate_conversion_arguments( model, @@ -525,7 +546,7 @@ def skip_real_div_ops(op): if pass_pipeline is None: pass_pipeline = PassPipeline() if not need_fp16_cast_pass: - pass_pipeline.remove_passes({"common::add_fp16_cast"}) + pass_pipeline.remove_passes({"common::add_fp16_cast", "common::add_int16_cast"}) if isinstance(compute_precision, FP16ComputePrecision): # For backward compatibility with the `op_selector` param in FP16ComputePrecision. 
pass_pipeline._pass_options["common::add_fp16_cast"] = [ @@ -584,7 +605,7 @@ def skip_real_div_ops(op): gc.collect() - mlmodel = _record_build_metadata(mlmodel, exact_source) + mlmodel = _record_build_metadata(mlmodel, exact_source, source_dialect=source_dialect) return mlmodel @@ -819,16 +840,45 @@ def _flatten_list(_inputs): raise ValueError("Input should be a list of TensorType or ImageType") elif exact_source == "pytorch": - if inputs is None: - raise ValueError('Expected argument for pytorch "inputs" not provided') + if _HAS_TORCH_EXPORT_API and isinstance(model, ExportedProgram): + if model.dialect != "EDGE": + raise NotImplementedError( + f"Conversion for models with only EDGE dialect is supported/tested. Provided Dialect: {model.dialect}" + ) - raise_if_duplicated(flat_inputs) - if inputs is not None and not all( - [isinstance(_input, InputType) for _input in flat_inputs] - ): - raise ValueError( - "Input should be a list/tuple (or nested lists/tuples) of TensorType or ImageType" - ) + # TODO: rdar://115845792 ([Executorch] Handle user provided inputs/outputs in the convert API) + if inputs is not None: + raise AssertionError("'inputs' argument should be None for ExportedProgram") + + if outputs is not None: + raise AssertionError("'outputs' argument should be None for ExportedProgram") + + else: + is_torch_load_successful = False + try: + try_load_torchscript(model) + is_torch_load_successful = True + except: + pass + if is_torch_load_successful: + if inputs is None: + raise ValueError( + 'Expected argument "inputs" for TorchScript models not provided' + ) + + raise_if_duplicated(flat_inputs) + if inputs is not None and not all( + [isinstance(_input, InputType) for _input in flat_inputs] + ): + raise ValueError( + "Input should be a list/tuple (or nested lists/tuples) of TensorType or ImageType" + ) + else: + raise TypeError( + "@model must either be a TorchScript object (or .pt or .pth file) or an ExportedProgram object (if using torch.export based API), received: {}".format( + type(model) + ) + ) elif exact_source == "milinternal": if not isinstance(model, Program): @@ -837,6 +887,19 @@ def _flatten_list(_inputs): ) +def _determine_source_dialect(model, exact_source): + + source_dialect = None + if exact_source == "pytorch": + + if _HAS_TORCH_EXPORT_API and isinstance(model, ExportedProgram): + return f"TorchExport::{model.dialect}" + else: + return "TorchScript" + + return source_dialect + + def _determine_source(model, source, output_names, outputs_as_tensor_or_image_types, @@ -875,9 +938,13 @@ def _determine_source(model, source, pass if source == "auto" and _HAS_TORCH: + + if _HAS_TORCH_EXPORT_API and isinstance(model, ExportedProgram): + return "pytorch" + is_torch_load_successful = False try: - pytorch_load(model) + try_load_torchscript(model) is_torch_load_successful = True except: pass @@ -953,6 +1020,12 @@ def _get_metadata_from_mlmodel(mlmodel): src_pkg_version = mlmodel.user_defined_metadata[_METADATA_SOURCE] coremltools_version = mlmodel.user_defined_metadata[_METADATA_VERSION] + src_dialect = ( + None + if _METADATA_SOURCE_DIALECT not in mlmodel.user_defined_metadata + else mlmodel.user_defined_metadata[_METADATA_SOURCE_DIALECT] + ) + src_pkg_version_list = src_pkg_version.split("==") if len(src_pkg_version_list) == 0: src_pkg, pkg_ver = None, None @@ -969,10 +1042,13 @@ def _get_metadata_from_mlmodel(mlmodel): if src_pkg is not None and pkg_ver is not None: build_info['coremltools-component-' + src_pkg] = str(pkg_ver) + if src_dialect is not None: + 
build_info["coremltools-source-dialect"] = src_dialect + return build_info -def _record_build_metadata(mlmodel, exact_source): +def _record_build_metadata(mlmodel, exact_source, source_dialect=None): # recording metadata: coremltools version, source framework and version if exact_source in {"tensorflow", "tensorflow2"} and (_HAS_TF_1 or _HAS_TF_2): src_pkg_version = "tensorflow=={0}".format(tf.__version__) @@ -986,6 +1062,9 @@ def _record_build_metadata(mlmodel, exact_source): mlmodel.user_defined_metadata[_METADATA_SOURCE] = src_pkg_version mlmodel.user_defined_metadata[_METADATA_VERSION] = _ct_version + if source_dialect is not None: + mlmodel.user_defined_metadata[_METADATA_SOURCE_DIALECT] = source_dialect + build_info = _get_metadata_from_mlmodel(mlmodel) mlmodel._set_build_info_mil_attributes(build_info) diff --git a/coremltools/converters/mil/backend/mil/load.py b/coremltools/converters/mil/backend/mil/load.py index 8f2c9d2ed..216eba527 100644 --- a/coremltools/converters/mil/backend/mil/load.py +++ b/coremltools/converters/mil/backend/mil/load.py @@ -5,6 +5,7 @@ import os import warnings +from typing import Optional import numpy as np @@ -22,7 +23,7 @@ from coremltools.converters.mil.backend.nn.load import _set_optional_inputs from coremltools.converters.mil.input_types import EnumeratedShapes, ImageType, RangeDim, TensorType from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import Function, mil_list, types +from coremltools.converters.mil.mil import Function, Program, mil_list, types from coremltools.converters.mil.mil.ops.registry import SSAOpRegistry from coremltools.converters.mil.mil.types.symbolic import any_symbolic, any_variadic, is_symbolic from coremltools.models.neural_network.flexible_shape_utils import ( @@ -282,21 +283,17 @@ def remove_output(block, prob_var): return out[0].name, out[1].name -def load(prog, weights_dir, resume_on_errors=False, specification_version=_SPECIFICATION_VERSION_IOS_15, **kwargs): +def _pymil_to_milproto( + prog: Program, + weights_dir: str, + specification_version: Optional[int] = _SPECIFICATION_VERSION_IOS_15, +) -> pm.Program: + """ + Convert a pymil program into mil proto. 
+ """ if BlobWriter is None: raise RuntimeError("BlobWriter not loaded") - if "main" not in prog.functions: - raise ValueError("main function not found in program") - # if user has specified "ClassifierConfig", then add the "classify" op to the prog - classifier_config = kwargs.get("classifier_config", None) - predicted_feature_name = None - predicted_probabilities_name = None - if classifier_config is not None: - predicted_feature_name, predicted_probabilities_name = _add_classify_op(prog, classifier_config) - - input_types = prog.main_input_types - output_types = prog.main_output_types weight_path = os.path.join(weights_dir, _WEIGHTS_FILE_NAME) blob_writer = BlobWriter(weight_path) @@ -310,6 +307,33 @@ def load(prog, weights_dir, resume_on_errors=False, specification_version=_SPECI version=1, functions=function_protos, ) + return proto + + +def load( + prog: Program, + weights_dir: str, + resume_on_errors: Optional[bool] = False, + specification_version: Optional[int] = _SPECIFICATION_VERSION_IOS_15, + **kwargs, +): + if "main" not in prog.functions: + raise ValueError("main function not found in program") + + # if user has specified "ClassifierConfig", then add the "classify" op to the prog + classifier_config = kwargs.get("classifier_config", None) + predicted_feature_name = None + predicted_probabilities_name = None + if classifier_config is not None: + predicted_feature_name, predicted_probabilities_name = _add_classify_op( + prog, classifier_config + ) + + # convert pymil program into mil proto + proto = _pymil_to_milproto(prog, weights_dir, specification_version) + + input_types = prog.main_input_types + output_types = prog.main_output_types desc = kwargs.get("model_description", None) if desc and not isinstance(desc, ml.ModelDescription): diff --git a/coremltools/converters/mil/converter.py b/coremltools/converters/mil/converter.py index 3fae1cb94..72f11769c 100644 --- a/coremltools/converters/mil/converter.py +++ b/coremltools/converters/mil/converter.py @@ -277,7 +277,7 @@ def mil_convert_to_proto( # behaviour same as before, the quantization pass is removed in this situation. # TODO: rdar://106111553 ([Infra] Quantization Pass is skipped when `mil_convert` is called directly.) main_pipeline = PassPipeline() - main_pipeline.remove_passes({"common::add_fp16_cast"}) + main_pipeline.remove_passes({"common::add_fp16_cast", "common::add_int16_cast"}) frontend_pipeline, backend_pipeline = _construct_other_pipelines( main_pipeline, convert_from, convert_to ) @@ -288,12 +288,13 @@ def mil_convert_to_proto( PassPipelineManager.apply_pipeline(prog, main_pipeline) - prog._check_invalid_program() - if convert_to == 'milinternal': return None, prog PassPipelineManager.apply_pipeline(prog, backend_pipeline) + + prog._check_early_error_out_for_invalid_program() + backend_converter_type = converter_registry.backends.get(convert_to.lower()) if not backend_converter_type: raise NotImplementedError( diff --git a/coremltools/converters/mil/frontend/milproto/load.py b/coremltools/converters/mil/frontend/milproto/load.py index 054ef871c..87da83595 100644 --- a/coremltools/converters/mil/frontend/milproto/load.py +++ b/coremltools/converters/mil/frontend/milproto/load.py @@ -411,25 +411,10 @@ def _load_function(context, func_spec, spec_version): return pymil_func -def load(model_spec, specification_version, file_weights_dir="", **kwargs): +def load_mil_proto(program_spec, specification_version, file_weights_dir=""): """ - Load MILProto to Pymil. 
- - Set force_spec_version to force override the spec version. + Load in-memory Proto specification of MILSpec.Program(.Proto) object to PyMIL """ - if not isinstance(model_spec, ml.Model): - raise TypeError("Invalid Model sepc object") - - if specification_version < model_spec.specificationVersion: - if not kwargs.get("force_spec_version", False): - raise ValueError( - "specification_version must be greater or equal to the input model spec version" - ) - - if model_spec.WhichOneof("Type") != "mlProgram": - raise ValueError("Only MIL proto based mlmodels can be loaded") - - program_spec = model_spec.mlProgram if not isinstance(program_spec, pm.Program): raise TypeError("Invalid Program spec object") @@ -451,3 +436,24 @@ def load(model_spec, specification_version, file_weights_dir="", **kwargs): raise ValueError("Invalid attribute for program") return pymil_program + + +def load(model_spec, specification_version, file_weights_dir="", **kwargs): + """ + Load in-memory Proto specification of Model(.Proto) object to PyMIL + + Set force_spec_version to force override the spec version. + """ + if not isinstance(model_spec, ml.Model): + raise TypeError("Invalid Model sepc object") + + if specification_version < model_spec.specificationVersion: + if not kwargs.get("force_spec_version", False): + raise ValueError( + "specification_version must be greater or equal to the input model spec version" + ) + + if model_spec.WhichOneof("Type") != "mlProgram": + raise ValueError("Only MIL proto based mlmodels can be loaded") + + return load_mil_proto(model_spec.mlProgram, specification_version, file_weights_dir) diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py b/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py index da22af453..33795f7e9 100644 --- a/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py +++ b/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py @@ -2622,6 +2622,15 @@ def test_ios17_resize_bilinear_dynamic_shape( target_shape, align_corners, ): + if ( + backend == ("mlprogram", "fp16") + and input_shape == (2, 5, 2, 3) + and target_shape == (20, 60) + ): + pytest.xfail( + "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" + ) + """ Since iOS17, dynamic shape is supported by lowering to `resize` MIL op. """ @@ -2723,6 +2732,15 @@ def test_ios17_resize_nearest_neighbor_dynamic_shape( input_shape, target_shape, ): + if ( + backend == ("mlprogram", "fp16") + and input_shape == (2, 5, 2, 3) + and target_shape == (20, 60) + ): + pytest.xfail( + "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" + ) + """ Since iOS17, dynamic shape is supported by lowering to `resize` MIL op. 
""" @@ -6742,6 +6760,15 @@ def build_model(x): def test_programmatic( self, compute_unit, backend, input_block_rank, dynamic_input, dynamic_crops ): + if ( + backend == ("mlprogram", "fp16") + and input_block_rank == (3, 1) + and dynamic_input + and not dynamic_crops + ): + pytest.xfail( + "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" + ) input_rank, block_rank = input_block_rank diff --git a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py index 47c8247c5..ee673e810 100644 --- a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py +++ b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py @@ -1389,6 +1389,11 @@ def test_lstm_time_distributed_dense(self, compute_unit, backend): "compute_unit, backend", itertools.product(compute_units, backends) ) def test_lstm_dynamic_batch(self, compute_unit, backend): + if backend == ("mlprogram", "fp16"): + pytest.xfail( + "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" + ) + input_shape = (1, 1280) inp = tf.keras.layers.Input(shape=input_shape) out, hn, cn = tf.keras.layers.LSTM(512, diff --git a/coremltools/converters/mil/frontend/torch/converter.py b/coremltools/converters/mil/frontend/torch/converter.py index 9e0be95c3..0ae3e5218 100644 --- a/coremltools/converters/mil/frontend/torch/converter.py +++ b/coremltools/converters/mil/frontend/torch/converter.py @@ -4,17 +4,20 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause from collections import OrderedDict +from typing import List, Optional, Union import numpy as np import torch as torch +from torch.jit._script import RecursiveScriptModule from coremltools import _logger as logger -from coremltools._deps import version_lt +from coremltools._deps import _HAS_TORCH_EXPORT_API from coremltools.converters.mil._deployment_compatibility import AvailableTarget as _target -from coremltools.converters.mil.input_types import ImageType, TensorType +from coremltools.converters.mil.input_types import ImageType, InputType, TensorType from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import Function, Program, types +from coremltools.converters.mil.mil import Function, Placeholder, Program, types from coremltools.converters.mil.mil.types import is_float +from coremltools.converters.mil.mil.var import Var from .._utils import get_output_names from .internal_graph import InternalTorchIRGraph, InternalTorchIRNode @@ -28,26 +31,37 @@ remove_getattr_nodes, transform_inplace_ops, ) - -torch_to_mil_types = { - torch.bool: types.bool, - torch.float16: types.fp16, - torch.float32: types.fp32, - torch.float64: types.fp32, - torch.int32: types.int32, - torch.int64: types.int32, -} - - -mil_to_torch_types = {v: k for k, v in torch_to_mil_types.items()} - +from .torchscript_utils import torch_to_mil_types + +if _HAS_TORCH_EXPORT_API: + from torch.export import ExportedProgram + + +def _convert_to_torch_inputtype(inputs: List[TensorType]) -> List[TensorType]: + input_type = [] + for _input in inputs: + if isinstance(_input, (list, tuple)): + input_type.append(_convert_to_torch_inputtype(_input)) + elif isinstance(_input, InputType): + if _input.shape is None: + raise ValueError( + "'shape' must be provided in the 'inputs' argument for pytorch conversion" + ) + input_type.append(_input) + elif isinstance(_input, torch.Tensor): + 
input_type.append( + TensorType(shape=_input.shape, dtype=torch_to_mil_types[_input.dtype]) + ) + else: + raise ValueError("Unknown type {} for conversion to InputType.".format(type(_input))) + return input_type class QuantizationContext: """ Utilities to manage information pertaining to quantization of tensors in a PyTorch graph. """ - def __init__(self, context): + def __init__(self, context: "TranscriptionContext") -> None: self._context = context # Maps var name to tuple of (torch dtype, scale, zero_point) @@ -71,7 +85,7 @@ def add_quantization_info(self, name, torch_dtype, scale, zero_point, axis=None) """ self._quant_param_map[name] = (torch_dtype, scale, zero_point, axis) - def get_quantization_info(self, name): + def get_quantization_info(self, name: str) -> None: """ Retrieves the information added via add_quantization_info, if applicable. Returns None if quantization parameters could not be found. @@ -80,7 +94,7 @@ def get_quantization_info(self, name): return None return self._quant_param_map[name] - def maybe_handle_quantized_inputs(self, node: InternalTorchIRNode): + def maybe_handle_quantized_inputs(self, node: InternalTorchIRNode) -> None: """ If a node's op doesn't support quantized inputs but gets one, this will wire it to receive a dequantized version of it. @@ -91,14 +105,15 @@ def maybe_handle_quantized_inputs(self, node: InternalTorchIRNode): # Op can handle quantized inputs. Nothing to do here. return - for input_name in node.inputs: - if self.get_quantization_info(input_name) is None: + for input in node.inputs: + # In Edge IR, input can be a literal and thus have no name + if not isinstance(input, str) or self.get_quantization_info(input) is None: # Not a quantized tensor continue # We need a dequantized version of the input to feed to the op. - dequantized_var, _ = self.get_dequantized_var(input_name) - node.replace_name(input_name, dequantized_var.name) + dequantized_var, _ = self.get_dequantized_var(input) + node.replace_name(input, dequantized_var.name) def get_quantized_per_tensor(self, name, torch_dtype, scale, zero_point, quantized_name): """ @@ -179,7 +194,7 @@ class TranscriptionContext: context when stepping out. """ - def __init__(self, name=None): + def __init__(self, name: Optional[str] = None) -> None: self.name = name if name else "" self._current_graph = [{}] self._torch_graph = None @@ -192,21 +207,24 @@ def torch_graph(self): return self._torch_graph @property - def quant_context(self): + def quant_context(self) -> QuantizationContext: return self._quant_context @torch_graph.setter def torch_graph(self, graph: InternalTorchIRGraph): self._torch_graph = graph - def prepare_for_conversion(self, node: InternalTorchIRNode): + def prepare_for_conversion(self, node: InternalTorchIRNode) -> None: """ Perform any preparation necessary before node-specific frontend conversion is invoked. """ - self.quant_context.maybe_handle_quantized_inputs(node) + return - def add(self, ssa_var, torch_name=None): + def process_inplace_op(self, node: InternalTorchIRNode): + return + + def add(self, ssa_var: Var, torch_name: Optional[str] = None, override=False) -> None: """ Arguments: ssa_var: Variable to add to the graph being constructed. 
@@ -215,12 +233,12 @@ def add(self, ssa_var, torch_name=None): """ if torch_name is None: torch_name = ssa_var.name - if torch_name in self._current_graph[-1]: - print(f"Torch var {torch_name} is added again.") + if torch_name in self._current_graph[-1] and not override: + logger.warning(f"Torch var {torch_name} is added again.") return self._current_graph[-1][torch_name] = ssa_var - def __getitem__(self, torch_name): + def __getitem__(self, torch_name: str) -> Var: """ Lookup a name in the context. Note that since nested blocks must be able to access anything that was defined before them, we have to @@ -276,26 +294,26 @@ def __repr__(self): class TorchConverter: """ - Class that handles conversion of pytorch models represented in TorchScript - format to the MIL format. + Class that handles conversion of pytorch models to the MIL format. Models passed to the @TorchConverter go from: - TorchScript -> Expanded/Optimized Torch IR -> Internal Graph -> CoreML SSA - The internal graph representation was added to make testing easier. + Loaded-Torch Model -> Internal Graph -> PyMIL """ def __init__( self, - torchscript, - inputs, - outputs=None, - cut_at_symbols=None, - opset_version=None, - use_default_fp16_io=False, - ): + loaded_model: Union[RecursiveScriptModule, "ExportedProgram"], + inputs: Optional[List[TensorType]] = None, + outputs: Optional[List[TensorType]] = None, + cut_at_symbols: Optional[List[str]] = None, + opset_version: Optional[int] = None, + use_default_fp16_io: bool = False, + ) -> None: """ Arguments: - torchscript: torch.jit.ScriptModule object representing the model to convert. + loaded_model: It could be one of the following: + - In-memory TorchScript model of type torch.jit.ScriptModule + - In-memory EdgeIR program of type ExportedProgram inputs: Input values and optional names. See kwarg in load.py for full description. outputs: List of outputs as ct.InputType. See kwarg in load.py for full description. cut_at_symbols: A list of internal symbol name strings. Graph conversion will @@ -307,46 +325,54 @@ def __init__( and the compute precision set to fp16, this flag is True. When True, fp32 i/o defaults to fp16. """ - assert isinstance(torchscript, torch.jit.ScriptModule) - - self.inputs = inputs - for idx, inp in enumerate(self.inputs): - if isinstance(inp, ImageType) and self.inputs[idx].channel_first is None: - self.inputs[idx].channel_first = True - - self.torchscript = torchscript - self.outputs = outputs self.use_default_fp16_io = use_default_fp16_io - if self.use_default_fp16_io: - # If the input type is not specified by the user and use_default_fp16_io - # is True. Make the default input type to fp16 - self._adjust_default_input_to_fp16() + if inputs is not None: + inputs = _convert_to_torch_inputtype(inputs) + self.inputs = inputs + for idx, inp in enumerate(self.inputs): + if isinstance(inp, ImageType) and self.inputs[idx].channel_first is None: + self.inputs[idx].channel_first = True + + if self.use_default_fp16_io: + # If the input type is not specified by the user and use_default_fp16_io + # is True. 
Make the default input type to fp16 + self._adjust_default_input_to_fp16() + self.outputs = outputs self.output_names = get_output_names(self.outputs) self.opset_version = _target(opset_version) if opset_version is not None else None self.context = TranscriptionContext() - raw_graph, params_dict = self._expand_and_optimize_ir(self.torchscript) - self.params_dict = params_dict - self.graph = InternalTorchIRGraph( - raw_graph, params_dict, self.inputs, cut_at_symbols - ) - self.context.torch_graph = self.graph + self._prog = Program() + + if isinstance(loaded_model, torch.jit.ScriptModule): + self.graph, self.params_dict, self.buffer_dict = InternalTorchIRGraph.from_torchscript( + torchscript=loaded_model, input_values=self.inputs, cut_at_symbols=cut_at_symbols + ) + + # TODO (rdar://106161395): Register Torch IR passes and unify them into the pass pipeline. + # Apply Torch IR passes + passes = [ + transform_inplace_ops, + flatten_graph_input_values, + flatten_graph_output_values, + remove_getattr_nodes, + generate_tensor_assignment_ops, + ] + for p in passes: + p(self.graph) + + elif _HAS_TORCH_EXPORT_API and isinstance(loaded_model, ExportedProgram): + self.graph = InternalTorchIRGraph.from_edgeir(edgeir=loaded_model) + self.params_dict, self.buffer_dict = None, None + else: + raise ValueError( + "Model should be an instance of either torch.jit.ScriptModule or ExportedProgram" + ) - # TODO (rdar://106161395): Register Torch IR passes and unify them into the pass pipeline. - # Apply Torch IR passes - passes = [ - transform_inplace_ops, - flatten_graph_input_values, - flatten_graph_output_values, - remove_getattr_nodes, - generate_tensor_assignment_ops, - ] - for p in passes: - p(self.graph) + self.context.torch_graph = self.graph self.inputs = list(self.graph.inputs.values()) - self._prog = Program() def _adjust_default_input_to_fp16(self): """ @@ -389,7 +415,7 @@ def _check_ops(graph): implemented_ops = set() missing_ops = set() for node in graph.nodes: - _add_op = _TORCH_OPS_REGISTRY.get(node.kind, None) + _add_op = _TORCH_OPS_REGISTRY.get_func(node.kind) if _add_op is None: missing_ops.add(node.kind) else: @@ -401,7 +427,9 @@ def _check_ops(graph): return implemented_ops, missing_ops @staticmethod - def _create_placeholder(_input): + def _create_placeholder( + _input: TensorType, + ) -> Placeholder: """ Converts an InputType into a Placeholder. @@ -416,6 +444,14 @@ def _create_placeholder(_input): dtype = types.fp32 return mb.placeholder(shape, dtype=dtype) + @staticmethod + def _preprocess_input_vars(input_var): + if ( + types.is_tensor(input_var.sym_type) or types.is_scalar(input_var.sym_type) + ) and input_var.dtype == types.fp16: + input_var = mb.cast(x=input_var, dtype="fp32") + return input_var + def check_ops(self): """ Returns the set of ops in @self.graph that are implemented, and @@ -423,7 +459,7 @@ def check_ops(self): """ return TorchConverter._check_ops(self.graph) - def convert_const(self): + def convert_const(self) -> None: for name, val in self.graph.params.items(): if isinstance(val, torch._C.ScriptObject): logger.info(f"Encountered constant {name} of type _torch._C.ScriptObject") @@ -444,40 +480,39 @@ def convert_const(self): const = mb.const(val=val, name=name) self.context.add(const) - def convert(self): + def convert(self) -> Program: logger.info("Converting graph.") - # This will hold the converted model. 
- prog = self._prog - - # Construct placeholder for input to SSA function - # This is where input renaming occurs - ssa_func_inputs = OrderedDict() + # Set SSA function input name to user defined name if provided. for index, (name, spec) in enumerate(self.graph.inputs.items()): - placeholder = self._create_placeholder(spec) - # Set SSA function input name to user defined name if provided. if spec.name is not None: name = spec.name self.inputs[index].name = name - ssa_func_inputs[name] = placeholder + + # This will hold the converted model. + prog = self._prog prog.set_main_input_types(tuple(self.inputs)) + # Construct placeholder for input to SSA function + ssa_func_inputs = OrderedDict() + for spec in self.inputs: + ssa_func_inputs[spec.name] = self._create_placeholder(spec) + # Initialize the SSA for conversion with Function(ssa_func_inputs, opset_version=self.opset_version) as ssa_func: # Map internal @self.graph.inputs to user specified @ssa_func_inputs # If @self.graph.inputs == @ssa_func_inputs this just adds the inputs # to the context. - for internal_name, users_name in zip( - self.graph.inputs.keys(), ssa_func_inputs.keys() - ): - input_var = ssa_func.inputs[users_name] - if ( - types.is_tensor(input_var.sym_type) or types.is_scalar(input_var.sym_type) - ) and input_var.dtype == types.fp16: - input_var = mb.cast(x=input_var, dtype="fp32") - self.context.add(input_var, torch_name=internal_name) - + # Convert input placeholders + user_names = list(ssa_func_inputs.keys()) + internal_names = list(self.graph.inputs.keys()) + internal_names.extend(user_names[len(internal_names) :]) + for torch_name, ssa_name in zip(internal_names, user_names): + input_var = self._preprocess_input_vars(ssa_func.inputs[ssa_name]) + self.context.add(input_var, torch_name=torch_name) + + # Convert constants self.convert_const() # Add the rest of the operations @@ -513,184 +548,3 @@ def convert(self): if self.outputs is not None: prog.set_main_output_types(self.outputs) return prog - - def _jit_pass_lower_graph(graph, torchscript): - """ - This graph pass does a similar thing as torch._C._jit_pass_lower_graph does. - It does two things: - 1. Rename getattr nodes which produce a torch tensor to match the keys in torch model's state_dict - 2. Construct the params_dict, with the keys similar to state_dict - - To be more specific, this graph pass traces down series of GetAttr ops, and rename the final node to match the torch model state_dict. - It also replaces the node inputs by the first created tensor node with the same name. - - Example: - Input graph: - graph(%self.1 : __torch__.torch.nn.modules.Sequential, %input.1 : Tensor): - %2 : prim::GetAttr[name="linear"](%self.1) - %3 : prim::GetAttr[name="weight"](%2) - %4 : prim::GetAttr[name="bias"](%2) - %5 : prim::GetAttr[name="bias"](%2) # duplicated node - %6 : conv(%input.1, %3, %4) - %7 : add(%input.1, %5) - return (%6, %7) - - Output graph: - graph(%self.1 : __torch__.torch.nn.modules.Sequential, %input.1 : Tensor): - %2 : prim::GetAttr[name="linear"](%self.1) - %linear.weight : prim::GetAttr[name="weight"](%2) - %linear.bias : prim::GetAttr[name="bias"](%2) - %5 : prim::GetAttr[name="bias"](%2) # duplicated node, it is not used now - %6 : conv(%input.1, %linear.weight, %linear.bias) - %7 : add(%input.1, %linear.bias) # the second input is replaced - return (%6, %7) - - And a dictionary {"linear.weight": ..., "linear.bias": ...} is returned, to record the parameters values. 
- Note that, those GetAttr nodes are still in the torch ir graph, but they would be removed in a latter - graph pass in the coremltools torch internal graph - - """ - - """ - Each getattr node corresponds to a torch object in the torch IR, - it could be either: - 1. torch.nn.modules: submodule in a torch model. For instance, a linear layer in a MLP network. - 2. torch.Tensor: torch model parameters. For instance, weight for a conv layer. - 3. torch._C.ScriptObject: quantized torch model parameters. - For example, in the graph above, %2 is pointing to the __torch__.torch.nn.modules.Sequential.linear torch submodule. - node_to_module_map tracks these mapping. - - node_to_prefic_map track the name for each module, - for example, %2 has the prefix name linear and %3 is linear.weight. - These names are also keys in the state_dict - """ - node_to_module_map = {} - node_to_prefix_map = {} - first_node_with_prefix = {} - replace_input = {} - - base_module_node = list(graph.inputs())[0] - node_to_module_map[base_module_node] = torchscript - node_to_prefix_map[base_module_node] = "" - - """ - params_dict will be contructed in this graph pass. It contains all const tensors needed for the graph computation. - And the value is validated against the state_dict if the key is presented in both dictionaries. - In some rare cases, state_dict lacks parameters / buffers, so we still need to go through the while graph ourselves. - """ - params_dict = {} - state_dict = torchscript.state_dict(keep_vars=True) - - def _check_is_tensor(node, module): - if not isinstance(module, torch.Tensor): - return False - if str(node.output().type()) not in ("Tensor", "Optional[Tensor]"): - raise TypeError(f'Type "{node.output().type()}" not supported') - return True - - def _check_is_quantized_tensor(node, module): - if not isinstance(module, torch._C.ScriptObject): - return False - # We only support ScriptObjects that correspond to quantized packed params. - assert "PackedParams" in node.output().type().name() - return True - - def _lower_graph_block(graph): - for node in list(graph.nodes()): - - for block in node.blocks(): - _lower_graph_block(block) - - for idx, _input in enumerate(list(node.inputs())): - if _input in replace_input: - node.replaceInput(idx, replace_input[_input]) - - kind = node.kind().split("::")[1].lower() - if kind != "getattr": - continue - - _input = node.input() - _output = node.output() - attr_name = getattr(node, node.kindOf("name"))("name") - - module = getattr(node_to_module_map[_input], attr_name) - node_to_module_map[_output] = module - - input_prefix = node_to_prefix_map[_input] - prefix = input_prefix + '.' 
+ attr_name if input_prefix != "" else attr_name - node_to_prefix_map[_output] = prefix - - is_tensor = _check_is_tensor(node, module) - is_quantized_tensor = _check_is_quantized_tensor(node, module) - - if is_tensor or is_quantized_tensor: - if is_tensor and prefix in state_dict: - assert torch.equal( - module.cpu(), state_dict[prefix].cpu() - ), "tensor value not consistent between torch ir and state_dict" - if prefix in params_dict: - assert torch.equal(module.cpu(), params_dict[prefix].cpu()) - replace_input[_output] = first_node_with_prefix[prefix] - else: - params_dict[prefix] = module - first_node_with_prefix[prefix] = _output - _output.setDebugName(prefix) - - _lower_graph_block(graph) - - return graph, params_dict - - @staticmethod - def _expand_and_optimize_ir(torchscript): - """ - Given a torch.jit.ScriptModule, convert it to a optimized - torch._C.Graph and dict of model parameter's names to tensors. - """ - graph = torchscript.forward.graph - - # From PyTorch code: Inline function and method calls. - torch._C._jit_pass_inline(graph) - # From PyTorch code: This inlines the forked section in the fork() - # callsite and replaces uses of the result of wait() calls with the - # values produced from the (now-inlined) forked section. - torch._C._jit_pass_inline_fork_wait(graph) - # Starting from the return node, marks all nodes that feed into the - # output, as well as nodes with side effects. Any nodes not marked are - # eliminated. - torch._C._jit_pass_dce(graph) - # From PyTorch code: checks well-formedness and invariants of graph. - torch._C._jit_pass_lint(graph) - # Replaces a couple specific ops patterns (add, sub, mul, div, chunk). - if version_lt(torch, "1.6.0"): - torch._C._jit_pass_canonicalize_ops(graph) - torch._C._jit_pass_lint(graph) - - # From PyTorch code: This pass catches all of the small, easy to catch - # peephole optimizations you might be interested in doing. - # Eliminate no-op 'expand' nodes - # Simplify x.t().t() to x - # pass disabled for v1.6.0 and onwards, wrongly captures the shape of dummy inputs during tracing. - torch._C._jit_pass_peephole(graph, addmm_fusion_enabled=False) - else: - # v1.6.0 pass renamed - torch._C._jit_pass_canonicalize_graph_fuser_ops(graph) - torch._C._jit_pass_lint(graph) - - # From PyTorch docs: Renumber the graph so that all structurally - # equivalent graphs have same numbers. - graph = torch._C._jit_pass_canonicalize(graph) - torch._C._jit_pass_lint(graph) - if version_lt(torch, "1.6.0"): - # v1.6.0 JIT changes disallows pulling list values out of - # prim::Constant. We can only pull scalar values. constant - # propagation removes `listConstruct` and results in list values. - # We disallow constant prop pass to keep them as scalars, and rely - # on our own constant prop to interpret `listConstruct`. - torch._C._jit_pass_constant_propagation(graph) - # NOTE: Don't need another DCE, it's included in constant propagation. 
- torch._C._jit_pass_lint(graph) - - # Get the params_dict and rename the getattr nodes in the graph - graph, params_dict = TorchConverter._jit_pass_lower_graph(graph, torchscript) - - return graph, params_dict diff --git a/coremltools/converters/mil/frontend/torch/dialect_ops.py b/coremltools/converters/mil/frontend/torch/dialect_ops.py index 101144c6c..6796328ed 100644 --- a/coremltools/converters/mil/frontend/torch/dialect_ops.py +++ b/coremltools/converters/mil/frontend/torch/dialect_ops.py @@ -4,14 +4,10 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause from coremltools.converters.mil.mil import Operation, get_new_symbol, types -from coremltools.converters.mil.mil.input_type import (DefaultInputs, - InputSpec, - TensorInputType) -from coremltools.converters.mil.mil.ops.defs._utils import \ - solve_slice_by_index_shape +from coremltools.converters.mil.mil.input_type import DefaultInputs, InputSpec, TensorInputType +from coremltools.converters.mil.mil.ops.defs._utils import get_param_val, solve_slice_by_index_shape from coremltools.converters.mil.mil.ops.registry import SSAOpRegistry -from coremltools.converters.mil.mil.types.symbolic import \ - is_compatible_symbolic_vector +from coremltools.converters.mil.mil.types.symbolic import is_compatible_symbolic_vector register_op = SSAOpRegistry.register_op @@ -60,7 +56,7 @@ class torch_upsample_nearest_neighbor(Operation): output_height=TensorInputType(type_domain=types.int32), output_width=TensorInputType(type_domain=types.int32), ) - + type_domains = { "T": (types.fp16, types.fp32), } @@ -144,11 +140,11 @@ class torch_tensor_assign(Operation): Parameters ---------- - data: tensor<*?, T> (Required) + x: tensor<*?, T> (Required) * Input tensor updates: tensor<\*K, T> (Required) * Value tensor to be inserted - * The shape of the updates tensor must match the slicing result of the input data. + * The shape of the updates tensor must match the slicing result of the input data ``x``. begin: tensor<[rank], i32> (Required) * Starting index for the dimension of slicing. end: tensor<[rank(x)], i32> (Required) @@ -164,7 +160,7 @@ class torch_tensor_assign(Operation): * If ``end_mask[i]==True``, neglect ``end[i]``, and set ``end[i]`` to ``x.shape[i]``. squeeze_mask: tensor<[rank(x)], bool> (Optional) * Default to all ``False``. - * If ``squeeze_mask[i]==true``, neglect ``end[i]``, and do the pure index at ``begin[i]``. + * If ``squeeze_mask[i]==True``, neglect ``end[i]``, and do the pure index at ``begin[i]``. 
Returns ------- @@ -177,7 +173,7 @@ class torch_tensor_assign(Operation): """ input_spec = InputSpec( - data=TensorInputType(type_domain="T"), + x=TensorInputType(type_domain="T"), updates=TensorInputType(type_domain="T"), begin=TensorInputType(type_domain=types.int32), end=TensorInputType(type_domain=types.int32), @@ -186,7 +182,7 @@ class torch_tensor_assign(Operation): end_mask=TensorInputType(const=True, optional=True, type_domain=types.bool), squeeze_mask=TensorInputType(const=True, optional=True, type_domain=types.bool), ) - + type_domains = { "T": (types.fp16, types.fp32, types.int32), } @@ -200,20 +196,21 @@ def default_inputs(self): ) def type_inference(self): - # Verify the updates and the data slicing have the same shape - begin = self.begin.val - end = self.end.val - data_rank = self.data.rank - stride = self.stride.val if self.stride is not None else [1] * data_rank - begin_mask = ( - self.begin_mask.val if self.begin_mask is not None else [False] * data_rank - ) - end_mask = self.end_mask.val if self.end_mask is not None else [False] * data_rank - squeeze_mask = ( - self.squeeze_mask.val if self.squeeze_mask is not None else [False] * data_rank + # solve shape + ret_shape = solve_slice_by_index_shape( + self.x.shape, + self.begin.val, + self.end.val, + get_param_val(self.stride), + get_param_val(self.begin_mask), + get_param_val(self.end_mask), + get_param_val(self.squeeze_mask), ) - data_shape = self.data.shape - expected_updates_shape = tuple(solve_slice_by_index_shape(data_shape, begin, end, stride, begin_mask, end_mask, squeeze_mask)) - if not is_compatible_symbolic_vector(expected_updates_shape, self.updates.shape): - raise ValueError("The updates tensor should have shape {}. Got {}".format(expected_updates_shape, self.updates.shape)) - return self.data.sym_type + if not is_compatible_symbolic_vector(ret_shape, self.updates.shape): + raise ValueError( + "The updates tensor should have shape {}. Got {}".format( + ret_shape, self.updates.shape + ) + ) + + return self.x.sym_type diff --git a/coremltools/converters/mil/frontend/torch/edgeir_utils.py b/coremltools/converters/mil/frontend/torch/edgeir_utils.py new file mode 100644 index 000000000..26d0ad821 --- /dev/null +++ b/coremltools/converters/mil/frontend/torch/edgeir_utils.py @@ -0,0 +1,34 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + + +from typing import List + +from torch import Tensor + +import coremltools as ct + +from .torchscript_utils import torch_to_mil_types + + + +def to_coreml_tensor_type(name: str, tensor: Tensor) -> "ct.TensorType": + # TODO: rdar://115845948 ([Executorch] Handle inputs of shapes with dynamic dimensions) + return ct.TensorType(name=name, dtype=torch_to_mil_types[tensor.dtype], shape=tensor.shape) + + +def extract_inputs_from_edge_program(exported_program) -> List["ct.TensorType"]: + module = exported_program.graph_module + inputs_to_parameters = exported_program.graph_signature.inputs_to_parameters + inputs_to_buffers = exported_program.graph_signature.inputs_to_buffers + inputs = [] + for node in module.graph.nodes: + if node.op == "placeholder" and node.meta is not None and "val" in node.meta: + if isinstance(node.meta["val"], Tensor): + if node.name not in inputs_to_parameters and node.name not in inputs_to_buffers: + inputs.append(to_coreml_tensor_type(node.name, node.meta["val"])) + else: + raise NotImplementedError("Only Tensor inputs handled yet") + return inputs diff --git a/coremltools/converters/mil/frontend/torch/internal_graph.py b/coremltools/converters/mil/frontend/torch/internal_graph.py index b6fd83507..cbf782d54 100644 --- a/coremltools/converters/mil/frontend/torch/internal_graph.py +++ b/coremltools/converters/mil/frontend/torch/internal_graph.py @@ -3,10 +3,15 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +from collections import OrderedDict + import torch +from torch.fx.node import Node -from collections import OrderedDict -from itertools import islice +from coremltools import _logger as logger + +from .edgeir_utils import extract_inputs_from_edge_program +from .torchscript_utils import _expand_and_optimize_ir _DEFAULT_OP_NAMESPACES = set(["aten", "prim"]) @@ -57,42 +62,56 @@ class InternalTorchIRBlock: coremltools internal representation of a torch IR block. """ - def __init__(self, raw_block=None, parent=None, nodes=None, inputs=None, outputs=None): - """" + def __init__(self, parent=None, nodes=None, inputs=None, outputs=None): + """ Arguments: - raw_block: The torch._C.Block to convert, or None. parent: The InternalTorchIRNode this block belongs to. - nodes: If @raw_block is None, the list of InternalTorchIRNodes in the block - inputs: If @raw_block is None, the list of input symbols. - outputs: If @raw_block is None, the list of output symbols. + nodes: list of InternalTorchIRNodes in the block + inputs: list of input symbols. + outputs: list of output symbols. 
""" - self.nodes = [] - node_names = set() - self.inputs = [] - self.outputs = [] + self.nodes = nodes + self.inputs = inputs + self.outputs = outputs self.parent = parent - if raw_block: - # Add nodes - for raw_node in raw_block.nodes(): - new_node = InternalTorchIRNode(raw_node, parent=self) - if new_node.name == new_node.kind: - new_node.name = _find_new_name(new_node.name, node_names) - self.nodes.append(new_node) - node_names.add(new_node.name) - - # Add inputs - for inp in raw_block.inputs(): - self.inputs.append(inp.debugName()) - - # Add outputs - for outp in raw_block.outputs(): - self.outputs.append(outp.debugName()) - else: - self.nodes = nodes - self.inputs = inputs - self.outputs = outputs + @classmethod + def from_edgeir_block(cls, block, parent): + raise NotImplementedError( + "EdgeIR: Support for Ops containing blocks not implemented yet" + ) # TODO: rdar://115846569 ([Executorch] Handle control flow ops from edge ir) + + @classmethod + def from_torchscript_block(cls, block, parent): + + node_names = set() + nodes = [] + inputs = [] + outputs = [] + + # Add inputs + for inp in block.inputs(): + inputs.append(inp.debugName()) + + # Add outputs + for outp in block.outputs(): + outputs.append(outp.debugName()) + + internal_block = cls(parent=parent, inputs=inputs, outputs=outputs, nodes=nodes) + + # Add nodes + for raw_node in block.nodes(): + new_node = InternalTorchIRNode.from_torchscript_node( + node=raw_node, parent=internal_block + ) + if new_node.name == new_node.kind: + new_node.name = _find_new_name(new_node.name, node_names) + internal_block.nodes.append(new_node) + node_names.add(new_node.name) + + return internal_block + def __str__(self, indent=2): indent_str = " " * indent @@ -131,51 +150,124 @@ class InternalTorchIRNode: """ def __init__( - self, node=None, parent=None, attr=None, inputs=None, outputs=None, kind=None, blocks=None, + self, + kind, + inputs, + outputs, + name=None, + parent=None, + attr=None, + blocks=None, ): """ Arguments: - node: The torch._C.Node to convert, or None. + name: Name of the node. + kind: the kind (op) of the node. + inputs: list of input symbols. + outputs: list of output symbols. parent: The InternalTorchIRGraph/Block this node belongs to. - attr: If @node is not specified, the dict of named attributes. - inputs: If @node is not specified, the list of input symbols. - outputs: If @node is not specified, the list of output symbols. - kind: If @node is not specified, the kind (op) of the node. - blocks: If @node is not specified, the list of InternalTorchIRBlock. + attr: dict of named attributes. + blocks: list of InternalTorchIRBlock. """ + if not name and not outputs: + self.name = "" + else: + self.name = name if name else outputs[0] + self.kind = kind + self.inputs = inputs + self.outputs = outputs self.parent = parent - if node is not None: - self.inputs = [_input.debugName() for _input in node.inputs()] - self.outputs = [output.debugName() for output in node.outputs()] - namespace = node.kind().split("::")[0].lower() - if namespace in _DEFAULT_OP_NAMESPACES: - # We conventionally skip the aten/prim namespaces in our naming. 
- self.kind = node.kind().split("::")[-1].lower() - else: - self.kind = node.kind().lower() - self.blocks = [InternalTorchIRBlock(raw_block=b, parent=self) for b in node.blocks()] - self.attr = { - name: getattr(node, node.kindOf(name))(name) - for name in node.attributeNames() - } - if "value" not in self.attr: - self.attr["value"] = None - # If the output is boolean, explicitly cast it so type inference - # will work correctly. - if len(self.outputs) == 1 and next(node.outputs()).type().str() == "bool": - self.attr["value"] = bool(self.attr["value"]) + self.attr = attr if attr is not None else {"value": None} + self.blocks = blocks if blocks is not None else [] + + @classmethod + def from_torchscript_node(cls, node, parent): + inputs = [_input.debugName() for _input in node.inputs()] + outputs = [output.debugName() for output in node.outputs()] + namespace = node.kind().split("::")[0].lower() + if namespace in _DEFAULT_OP_NAMESPACES: + # We conventionally skip the aten/prim namespaces in our naming. + kind = node.kind().split("::")[-1].lower() else: - self.inputs = inputs - self.outputs = outputs - self.kind = kind - self.blocks = blocks if blocks is not None else [] - self.attr = attr if attr is not None else {"value": None} + kind = node.kind().lower() + + attr = {name: getattr(node, node.kindOf(name))(name) for name in node.attributeNames()} + if "value" not in attr: + attr["value"] = None + # If the output is boolean, explicitly cast it so type inference + # will work correctly. + if len(outputs) == 1 and next(node.outputs()).type().str() == "bool": + attr["value"] = bool(attr["value"]) + # On rare occassions, a node has no outputs. In that case, the node's # name will be its kind. However, this no longer guarantees the node's # name is unique. It will be up to the graph constructing the node to # make sure names are unique. - self.name = self.outputs[0] if len(self.outputs) > 0 else self.kind + name = outputs[0] if len(outputs) > 0 else kind + + internal_node = cls( + name=name, + kind=kind, + parent=parent, + inputs=inputs, + outputs=outputs, + attr=attr, + blocks=None, + ) + internal_node.blocks = [ + InternalTorchIRBlock.from_torchscript_block(block=b, parent=internal_node) + for b in node.blocks() + ] + return internal_node + + @classmethod + def from_edgeir_node(cls, node): + def get_arguments(alist): + args = [] + for i in alist: + if isinstance(i, Node): + args.append(i.name) + elif isinstance(i, torch.fx.immutable_collections.immutable_list): + args.append(get_arguments(i)) + elif isinstance(i, (int, float)): + args.append(i) + elif i is None: + args.append(None) + else: + raise AssertionError(f"Unhandled type of the node: {type(i)}") + return tuple(args) + + inputs = get_arguments(node.args) + outputs = [ + node.name + ] # TODO: rdar://115846125 ([Executorch] Handle Models/Layers with Multiple outputs) + + try: + kind = node.target.name() + except: + if callable(node.target): + kind = node.target.__name__ + else: + kind = str(node.target) + + namespace = kind.split("::")[0].lower() + if namespace in _DEFAULT_OP_NAMESPACES: + # We conventionally skip the aten/prim namespaces in our naming. 
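For EdgeIR nodes the same namespace-stripping convention is applied to the `node.target` name. A small self-contained sketch of that normalization (the helper name is made up for illustration; the example string matches the `add.tensor` alias registered later in ops.py):

```python
_DEFAULT_OP_NAMESPACES = set(["aten", "prim"])

def _normalize_kind(kind: str) -> str:
    # "aten::add.Tensor" is looked up in the op registry as "add.tensor".
    namespace = kind.split("::")[0].lower()
    if namespace in _DEFAULT_OP_NAMESPACES:
        return kind.split("::")[-1].lower()
    return kind.lower()

assert _normalize_kind("aten::add.Tensor") == "add.tensor"
assert _normalize_kind("my_ns::CustomOp") == "my_ns::customop"
```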
+ kind = kind.split("::")[-1].lower() + else: + kind = kind.lower() + + name = node.name + return cls( + name=name, + kind=kind, + inputs=inputs, + outputs=outputs, + parent=None, + attr=None, + blocks=None, + ) def __str__(self, indent=2): node_str = " " * indent + "{} = {}".format( @@ -229,73 +321,151 @@ class InternalTorchIRGraph: """ def __init__( - self, raw_graph=None, params_dict=None, input_values=None, cut_at_symbols=None, - nodes=None, params=None, inputs=None, outputs=None, + self, + params, + inputs, + outputs, + nodes=None, ): """ Arguments: - raw_graph: raw_graph: The torch._C.Graph to convert, or None. - params_dict: A dictionary mapping graph parameter names to tensors. - Must be given if @raw_graph is not None. + params: dict mapping parameter names to their numpy value. + inputs: OrderedDict mapping input names to their example values. + outputs: list[str], list of outputs from the graph. + nodes: list of InternalTorchIRNodes in the graph. + """ + self.nodes = nodes + self.params = params + self.inputs = inputs + self.outputs = outputs + + @classmethod + def from_torchscript(cls, torchscript, input_values=None, cut_at_symbols=None): + """ + Arguments: + torchscript: TorchScript object representing the model to convert. input_values: A list of inputs to the graph. Must be given is @raw_graph if not None. cut_at_symbols: The list of desired outputs from the graph. Symbols must be present in the graph. For debugging use only. Can only be given if @raw_graph is not None. - nodes: If @raw_graph is None, the list of InternalTorchIRNodes in - the graph. - params: If @raw_graph is None, the dict mapping parameter names to - their numpy value. - inputs: If @raw_graph is None, the OrderedDict mapping input names - to their example values. - outputs: list[str], If @raw_graph is None, the list of outputs from the graph. """ + if not isinstance(torchscript, torch.jit.ScriptModule): + raise AssertionError( + f"Input should be an object of type torch.jit.ScriptModule. Provide: {type(torchscript)}" + ) - self.nodes = [] - node_names = set() - self.params = {} - self.inputs = OrderedDict() - self.outputs = [] - - if raw_graph is not None: - # Add nodes - for raw_node in raw_graph.nodes(): - new_node = InternalTorchIRNode(raw_node, parent=self) - if new_node.name == new_node.kind: - new_node.name = _find_new_name(new_node.name, node_names) - self.nodes.append(new_node) - node_names.add(new_node.name) - - # Add params - for name, param in params_dict.items(): - if isinstance(param, torch.Tensor): - if param.is_quantized: - value = param - else: - value = param.detach().cpu().numpy() - else: + if hasattr(torchscript, "training") and torchscript.training: + logger.warning( + "Model is not in eval mode. " + "Consider calling '.eval()' on your model prior to conversion" + ) + if type(torchscript) == torch.jit._script.RecursiveScriptModule: + logger.warning( + "Support for converting Torch Script Models is experimental. " + "If possible you should use a traced model for conversion." + ) + + nodes = [] + params = {} + inputs = OrderedDict() + outputs = [] + + raw_graph, params_dict, buffer_dict = _expand_and_optimize_ir(torchscript) + + # Add params + for name, param in params_dict.items(): + if isinstance(param, torch.Tensor): + if param.is_quantized: value = param - self.params[name] = value - - # Add inputs - # The first element of the raw_graph.inputs() is the 'self' of the module, which is not used. 
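An illustrative sketch of the new classmethod entry point (the toy module is made up; `input_values` mirrors the converted `InputType` list that `TorchConverter` normally passes in, and the method also returns the raw parameter and buffer dictionaries):

```python
import torch
import coremltools as ct
from coremltools.converters.mil.frontend.torch.internal_graph import InternalTorchIRGraph

class Small(torch.nn.Module):
    def forward(self, x):
        return x + 1.0

traced = torch.jit.trace(Small().eval(), torch.rand(2, 3))
graph, params_dict, buffer_dict = InternalTorchIRGraph.from_torchscript(
    traced, input_values=[ct.TensorType(shape=(2, 3))]
)
print(graph)  # InternalTorchIRGraph implements __str__ for debugging
```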
- graph_inputs = list(raw_graph.inputs())[1:] - for index, _input in enumerate(islice(graph_inputs, len(input_values))): - name = _input.debugName() - value = input_values[index] - self.inputs[name] = value - - # Add outputs, cutting if @cut_at_symbols is set - output_names = cut_at_symbols - if output_names is None: - output_names = [x.debugName() for x in raw_graph.outputs()] - for output in output_names: - self.outputs.append(output) - else: - self.nodes = nodes - self.params = params - self.inputs = inputs - self.outputs = outputs + else: + value = param.detach().cpu().numpy() + else: + value = param + params[name] = value + + # Add inputs + # The first element of the raw_graph.inputs() is the 'self' of the module, which is not used. + graph_inputs = list(raw_graph.inputs())[1:] + if len(graph_inputs) != len(input_values): + raise ValueError( + f"Number of TorchScript inputs ({len(graph_inputs)}) must match the user provided inputs ({len(input_values)})." + ) + for index, _input in enumerate(graph_inputs): + name = _input.debugName() + value = input_values[index] + inputs[name] = value + + # Add outputs, cutting if @cut_at_symbols is set + output_names = cut_at_symbols + if output_names is None: + output_names = [x.debugName() for x in raw_graph.outputs()] + for output in output_names: + outputs.append(output) + + internal_graph = cls(nodes=nodes, params=params, inputs=inputs, outputs=outputs) + + node_names = set() + # Add nodes + for raw_node in raw_graph.nodes(): + new_node = InternalTorchIRNode.from_torchscript_node( + node=raw_node, parent=internal_graph + ) + if new_node.name == new_node.kind: + new_node.name = _find_new_name(new_node.name, node_names) + internal_graph.nodes.append(new_node) + node_names.add(new_node.name) + + return internal_graph, params_dict, buffer_dict + + @classmethod + def from_edgeir(cls, edgeir): + exported_program = edgeir + + nodes = [] + params = {} + outputs = [] + inputs = OrderedDict( + [ + (i.name, i) + for i in extract_inputs_from_edge_program(exported_program=exported_program) + ] + ) + + inputs_to_parameters = exported_program.graph_signature.inputs_to_parameters + inputs_to_buffers = exported_program.graph_signature.inputs_to_buffers + + inputs_to_consts = {**inputs_to_parameters, **inputs_to_buffers} + + parameters_to_inputs = { + v: k if not k.startswith("%") else k[1:] for k, v in inputs_to_consts.items() + } + + # Add params + for name, param in exported_program.state_dict.items(): + if isinstance(param, torch.Tensor): + value = param.detach().cpu().numpy() + else: + raise NotImplementedError("Only torch.Tensor handled yet") + + params[name if name not in parameters_to_inputs else parameters_to_inputs[name]] = value + + graph = exported_program.graph + + outputs = [] + for node in graph.nodes: + if node.op == "call_function": + nodes.append(InternalTorchIRNode.from_edgeir_node(node=node)) + elif node.op == "placeholder": + continue + elif node.op == "output": + outputs = [ + node.name for node in node.args[0] + ] # TODO: rdar://115846125 ([Executorch] Handle Models/Layers with Multiple outputs) + else: + raise NotImplementedError(f"Nodes of type {node.op} not yet implemented") + + return cls(nodes=nodes, params=params, inputs=inputs, outputs=outputs) def __str__(self): graph_str = "graph(\n" diff --git a/coremltools/converters/mil/frontend/torch/load.py b/coremltools/converters/mil/frontend/torch/load.py index ca822776d..38229634b 100644 --- a/coremltools/converters/mil/frontend/torch/load.py +++ 
b/coremltools/converters/mil/frontend/torch/load.py @@ -4,32 +4,40 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause import os.path as _os_path +from typing import List, Optional, Union import torch as _torch +from torch.jit._script import RecursiveScriptModule -from coremltools import _logger as logger -from coremltools.converters.mil.input_types import InputType, TensorType +from coremltools._deps import _HAS_TORCH_EXPORT_API +from coremltools.converters.mil.frontend.torch.converter import TorchConverter +from coremltools.converters.mil.input_types import TensorType +from coremltools.converters.mil.mil.program import Program -from .converter import TorchConverter, torch_to_mil_types +from .converter import TorchConverter +if _HAS_TORCH_EXPORT_API: + from torch.export import ExportedProgram def load( - model_spec, - inputs, - specification_version, - debug=False, - outputs=None, - cut_at_symbols=None, - use_default_fp16_io=False, + spec: Union[RecursiveScriptModule, "ExportedProgram", str], + inputs: List[TensorType], + specification_version: int, + debug: bool = False, + outputs: Optional[List[TensorType]] = None, + cut_at_symbols: Optional[List[str]] = None, + use_default_fp16_io: bool = False, **kwargs -): +) -> Program: """ Convert PyTorch model to mil CoreML format. Parameters ---------- - model_spec: String path to .pt file, or a TorchScript object representing - the model to convert. + spec: It could be one of the following: + - String path to .pt file containing serialized torchscript model + - In memory TorchScript model of type torch.jit.ScriptModule + - In memory EdgeIR program of type ExportedProgram inputs: Can be a singular element or list of elements of the following form 1. Any subclass of InputType 2. torch.Tensor (only shape and dtype will be used) @@ -54,28 +62,25 @@ def load( and the compute precision set to fp16, this flag is True. When True, fp32 i/o defaults to fp16. """ - torchscript = _torchscript_from_model(model_spec) - if hasattr(torchscript, 'training') and torchscript.training: - logger.warning("Model is not in eval mode. " - "Consider calling '.eval()' on your model prior to conversion") - if type(torchscript) == _torch.jit._script.RecursiveScriptModule: - logger.warning("Support for converting Torch Script Models is experimental. 
" - "If possible you should use a traced model for conversion.") + if _HAS_TORCH_EXPORT_API and isinstance(spec, ExportedProgram): + model = spec + else: + model = _torchscript_from_spec(spec) - inputs = _convert_to_torch_inputtype(inputs) converter = TorchConverter( - torchscript, + model, inputs, outputs, cut_at_symbols, specification_version, use_default_fp16_io, ) + return _perform_torch_convert(converter, debug) -def _torchscript_from_model(model_spec): +def _torchscript_from_spec(model_spec: RecursiveScriptModule) -> RecursiveScriptModule: if isinstance(model_spec, str) and (model_spec.endswith(".pt") or model_spec.endswith(".pth")): filename = _os_path.abspath(model_spec) return _torch.jit.load(filename) @@ -88,28 +93,8 @@ def _torchscript_from_model(model_spec): ) ) -def _convert_to_torch_inputtype(inputs): - input_type = [] - for _input in inputs: - if isinstance(_input, (list, tuple)): - input_type.append(_convert_to_torch_inputtype(_input)) - elif isinstance(_input, InputType): - if _input.shape is None: - raise ValueError("'shape' must be provided in the 'inputs' argument for pytorch conversion") - input_type.append(_input) - elif isinstance(_input, _torch.Tensor): - input_type.append( - TensorType( - shape=_input.shape, dtype=torch_to_mil_types[_input.dtype] - ) - ) - else: - raise ValueError( - "Unknown type {} for conversion to InputType.".format(type(_input)) - ) - return input_type -def _perform_torch_convert(converter, debug): +def _perform_torch_convert(converter: TorchConverter, debug: bool) -> Program: try: prog = converter.convert() except RuntimeError as e: diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py index e15adc3be..51b968d2a 100644 --- a/coremltools/converters/mil/frontend/torch/ops.py +++ b/coremltools/converters/mil/frontend/torch/ops.py @@ -8,7 +8,7 @@ import numbers import re from collections.abc import Iterable -from typing import List, Optional +from typing import Any, List, Optional import numpy as _np import numpy as np @@ -67,31 +67,29 @@ def convert_nodes(context, graph): """ for node in _tqdm(graph.nodes, desc="Converting PyTorch Frontend ==> MIL Ops", unit=" ops"): op_lookup = node.kind - if op_lookup.startswith("__") and op_lookup.endswith("__"): - # Some ops may have double underscore, such as `__and__`. - op_lookup = op_lookup[2:-2] - elif op_lookup.endswith("_"): - # This is an "in place" op. - # Look up the standard op instead by removing underscore. - op_lookup = op_lookup[:-1] - add_op = _TORCH_OPS_REGISTRY.get(op_lookup, None) - - logger.info("Converting op {} : {}".format(node.name, node.kind)) + add_op = _TORCH_OPS_REGISTRY.get_func(op_lookup) if add_op is None: - if re.match(r".*_dynamic", node.kind): + if re.match(r".*_dynamic", op_lookup): raise RuntimeError( - f"PyTorch convert function for op '{node.kind}' not implemented.\n" + f"PyTorch convert function for op '{op_lookup}' not implemented.\n" "Dynamic quantized models are not supported by Core ML.\n" "Please use static quantization or the APIs in coremltools.optimize to quantize/compress models." ) else: raise RuntimeError( - f"PyTorch convert function for op '{node.kind}' not implemented." + f"PyTorch convert function for op '{op_lookup}' not implemented." 
) + logger.info("Converting op {} : {}".format(node.name, op_lookup)) + + context.quant_context.maybe_handle_quantized_inputs(node) context.prepare_for_conversion(node) + add_op(context, node) + if _TORCH_OPS_REGISTRY.is_inplace_op(op_lookup): + context.process_inplace_op(node) + # We've generated all the outputs the graph needs, terminate conversion. if _all_outputs_present(context, graph): break @@ -196,7 +194,34 @@ def _get_inputs(context, node, expected=None, min_expected=None) -> List[Var]: @expected is not None, also verifies the number of inputs matches the value of @expected. """ - inputs = [context[name] for name in node.inputs] + + def get_bindings(alist) -> List[Any]: + """ + This utility is needed in order to handle following cases: + With EdgeIR, + - Some of the inputs can be literals (like axis, perms) and thus can be of types: list, int etc. + - An Input Parameter of an op could be a list/tuple similar to our concat layer + """ + results = [] + + for i in alist: + if isinstance(i, str): + results.append(context[i]) + elif isinstance(i, (list, tuple)) and all(isinstance(j, int) for j in i): + results.append(mb.const(val=i)) + elif isinstance(i, (list, tuple)): + results.append(get_bindings(i)) + elif isinstance(i, (int, float)): + results.append(mb.const(val=i)) + elif i is None: + results.append(None) + else: + raise NotImplementedError(f"Binding of inputs of type {type(i)} not handled yet") + + return results + + inputs = get_bindings(node.inputs) + if expected is not None: expected = [expected] if not isinstance(expected, (list, tuple)) else expected @@ -694,9 +719,9 @@ def eq(context, node): x = inputs[0] y = inputs[1] if is_bool(x.dtype): - x = mb.cast(x=x, dtype='int32') + x = mb.cast(x=x, dtype="int32") if is_bool(y.dtype): - y = mb.cast(x=y, dtype='int32') + y = mb.cast(x=y, dtype="int32") x, y = promote_input_dtypes([x, y]) equal_to = mb.equal(x=x, y=y, name=node.name) context.add(equal_to) @@ -708,9 +733,9 @@ def ne(context, node): x = inputs[0] y = inputs[1] if is_bool(x.dtype): - x = mb.cast(x=x, dtype='int32') + x = mb.cast(x=x, dtype="int32") if is_bool(y.dtype): - y = mb.cast(x=y, dtype='int32') + y = mb.cast(x=y, dtype="int32") x, y = promote_input_dtypes([x, y]) equal_to = mb.not_equal(x=x, y=y, name=node.name) context.add(equal_to) @@ -774,8 +799,8 @@ def transpose(context, node): context.add(res) -@register_torch_op -def permute(context, node): +@register_torch_op(torch_alias=["permute"]) +def permute_copy(context, node): inputs = _get_inputs(context, node, expected=2) perm = mb.transpose(x=inputs[0], perm=inputs[1], name=node.name) context.add(perm) @@ -807,18 +832,19 @@ def pixel_unshuffle(context, node): context.add(perm) -@register_torch_op(torch_alias=["bmm"]) +@register_torch_op(torch_alias=["bmm", "mm"]) def matmul(context, node): inputs = _get_inputs(context, node, expected=2) if inputs[1].val is not None and \ len(inputs[1].shape) == 2 and len(inputs[0].shape) <= 3: res = mb.linear(x=inputs[0], weight=_np.transpose(inputs[1].val), name=node.name) else: - res = mb.matmul(x=inputs[0], y=inputs[1], name=node.name) + x, y = promote_input_dtypes([inputs[0], inputs[1]]) + res = mb.matmul(x=x, y=y, name=node.name) context.add(res) -@register_torch_op +@register_torch_op(torch_alias=["add.tensor"]) def add(context, node): add_inputs = _get_inputs(context, node) assert len(node.outputs) == 1 @@ -852,12 +878,12 @@ def addmm(context, node): # output = beta * input + alpha * mat1 * mat2 assert len(node.outputs) == 1 - inputs = _get_inputs(context, node, 
expected=5) + inputs = _get_inputs(context, node, expected=[3, 4, 5]) bias = inputs[0] mat1 = inputs[1] mat2 = inputs[2] - beta = inputs[3] - alpha = inputs[4] + beta = inputs[3] if len(inputs) > 3 else mb.const(val=1.0) + alpha = inputs[4] if len(inputs) > 4 else mb.const(val=1.0) if beta.val != 1.0: # Apply scaling factor beta to the bias. @@ -889,7 +915,7 @@ def linear(context, node): context.add(res, torch_name=node.name) -@register_torch_op(torch_alias=["conv2d"]) +@register_torch_op(torch_alias=["conv2d", "convolution"]) def _convolution(context, node): inputs = _get_inputs(context, node) @@ -923,7 +949,7 @@ def _convolution(context, node): dilations = inputs[5] out_pad = None - if len(inputs) >= 12: + if len(inputs) >= 9: transposed = inputs[6].val out_pad = inputs[7].val group = inputs[8] @@ -1042,7 +1068,7 @@ def _convolution_mode(context, node): ) -@register_torch_op +@register_torch_op(torch_alias=["_softmax"]) def softmax(context, node): inputs = _get_inputs(context, node) @@ -1333,14 +1359,15 @@ def _max_pool(context, node, inputs): strides = mb.const(val=kernel_sizes.val, name=strides.name) pad_type = "custom" - # Need to explicitly state L-R, T-B pad - pad = inputs[3] - pad = _np.repeat(pad.val, 2) - dilation = inputs[4].val - ceil_mode = inputs[5].val + + pad = np.array([0] * (kernel_sizes.shape[0] * 2)) if len(inputs) < 4 else _np.repeat(inputs[3].val, 2) + dilation = np.array([1] * kernel_sizes.shape[0]) if len(inputs) < 5 else inputs[4].val + ceil_mode = False if len(inputs) < 6 else inputs[5].val + if _np.any(dilation > 1): # See: rdar://60633736 (Implement dilation for mil op max_pool) raise ValueError("@max_pool does not support dilation > 1") + spatial_rank = len(pad) // 2 if spatial_rank > 2 and ceil_mode is True and list(strides.val) != [1] * len(strides.val): # since MIL does not support ceil_mode for 3D pool, @@ -1358,7 +1385,12 @@ def _max_pool(context, node, inputs): name=node.name, ceil_mode=ceil_mode if spatial_rank <= 2 else False, ) - context.add(pool) + + if node.kind == "max_pool2d_with_indices": + # TODO(rdar://117038432) ([Executorch] Handle/Bind other outputs of `max_pool2d_with_indices` op during lowering) + context.add((pool, None), torch_name=node.name) + else: + context.add(pool) @register_torch_op @@ -1367,9 +1399,9 @@ def max_pool1d(context, node): _max_pool(context, node, inputs) -@register_torch_op +@register_torch_op(torch_alias=["max_pool2d_with_indices"]) def max_pool2d(context, node): - inputs = _get_inputs(context, node, expected=6) + inputs = _get_inputs(context, node, min_expected=3) _max_pool(context, node, inputs) @@ -1406,7 +1438,7 @@ def maximum(context, node): context.add(out) -@register_torch_op +@register_torch_op(torch_alias = ["div.tensor"]) def div(context, node): inputs = _get_inputs(context, node, expected=[2, 3]) x = mb.cast(x=inputs[0], dtype="fp32") @@ -1457,7 +1489,7 @@ def true_divide(context, node): context.add(res) -@register_torch_op +@register_torch_op(torch_alias=["mul.tensor", "mul.scalar"]) def mul(context, node): inputs = _get_inputs(context, node, expected=2) x, y = promote_input_dtypes(inputs) @@ -1505,9 +1537,11 @@ def sub(context, node): @register_torch_op( torch_alias=[ + "mean.dim", "sum", "logsumexp", - ]) + ] +) def mean(context, node): inputs = _get_inputs(context, node) @@ -1546,18 +1580,22 @@ def mean(context, node): context.add(res) -@register_torch_op +@register_torch_op(torch_alias=["squeeze_copy.dim", "squeeze_copy.dims"]) def squeeze(context, node): inputs = _get_inputs(context, node) if 
len(inputs) == 1: res = mb.squeeze(x=inputs[0], name=node.name) elif len(inputs) == 2: - squeeze_dim = inputs[1].val - res = mb.squeeze(x=inputs[0], axes=(squeeze_dim,), name=node.name) + dims = inputs[1].val + try: + dims = (int(dims),) + except: + pass + res = mb.squeeze(x=inputs[0], axes=dims, name=node.name) context.add(res) -@register_torch_op +@register_torch_op(torch_alias=["unsqueeze_copy"]) def unsqueeze(context, node): inputs = _get_inputs(context, node, expected=2) unsqueeze = mb.expand_dims(x=inputs[0], axes=[inputs[1].val], name=node.name) @@ -1591,7 +1629,7 @@ def _shape_as_tensor(context, node): context.add(shape_node, node.name) -@register_torch_op(torch_alias=["reshape"]) +@register_torch_op(torch_alias=["view_copy", "reshape"]) def view(context, node): inputs = _get_inputs(context, node, expected=2) x = inputs[0] @@ -1602,9 +1640,8 @@ def view(context, node): indices = mb.range_1d(start=0, end=length, step=1) shape = mb.list_gather(ls=shape, indices=indices) - if ( - isinstance(shape, list) - and all([isinstance(dim, Var) and len(dim.shape) == 0 for dim in shape]) + if isinstance(shape, list) and all( + [isinstance(dim, Var) and len(dim.shape) == 0 for dim in shape] ): shape = mb.concat(values=shape, axis=0) @@ -1739,25 +1776,39 @@ def _adaptive_pool2d(context, node, pool_op, reduce_op): context.add(result) -@register_torch_op +@register_torch_op(torch_alias=["_native_batch_norm_legit_no_training"]) def batch_norm(context, node): - inputs = _get_inputs(context, node, expected=9) - # inputs skipped: - # float momentum (6) - # bool cudnn_enabled (8) - input_rank = inputs[0].rank - if input_rank < 2 or input_rank > 5: - raise ValueError( - "BatchNorm: Encountered invalid input rank during translation in torch frontend." - ) + inputs = _get_inputs(context, node, expected=[7, 9]) _input = inputs[0] weight = inputs[1] bias = inputs[2] running_mean = inputs[3] running_var = inputs[4] - training = inputs[5].val - eps = inputs[7] + + if len(inputs) == 9: + # inputs skipped: + # float momentum (6) + # bool cudnn_enabled (8) + + training = inputs[5].val + eps = inputs[7] + # no: training, cudnn_enabled + elif len(inputs) == 7: + # inputs skipped: + # float momentum (5) + eps = inputs[6] + + training = False + else: + raise ValueError( + f"BatchNorm: got {len(inputs)} inputs, expected 7 or 9" + ) + input_rank = _input.rank + if input_rank < 2 or input_rank > 5: + raise ValueError( + "BatchNorm: Encountered invalid input rank during translation in torch frontend." + ) # If training = True, the mean and variance of the current batch of data are used to normalize the input data. # If training = False, data statistics running_mean and running_var are used instead. 
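The two input layouts handled above come from different frontends: TorchScript's `aten::batch_norm` carries 9 arguments, while the EdgeIR variant `_native_batch_norm_legit_no_training` carries 7 and is implicitly in inference mode. A small sketch of an eval-mode module that yields the 7-argument form when exported (shapes are arbitrary):

```python
import torch

bn = torch.nn.BatchNorm2d(num_features=4).eval()
x = torch.rand(1, 4, 8, 8)

# TorchScript tracing keeps aten::batch_norm with
#   (input, weight, bias, running_mean, running_var,
#    training, momentum, eps, cudnn_enabled)            -> 9 inputs
# torch.export / ExecuTorch lowers the eval-mode module to
# _native_batch_norm_legit_no_training with
#   (input, weight, bias, running_mean, running_var,
#    momentum, eps)                                      -> 7 inputs
y = bn(x)
```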
@@ -1797,7 +1848,7 @@ def _add_batch_norm_dynamic(): bias_reshape = mb.reshape(x=bias, shape=shape) x = mb.add(x=x, y=bias_reshape, name=node.name) - context.add(x) + return x def _add_batch_norm_1d(): # first expand the 3d tensor to 4d, and call the standard mb.batch_norm @@ -1812,7 +1863,7 @@ def _add_batch_norm_1d(): name=node.name + "_batch_norm_1d", ) bn = mb.squeeze(x=bn, name=node.name, axes=[-1]) - context.add(bn) + return bn def _add_batch_norm(): bn = mb.batch_norm( @@ -1824,16 +1875,22 @@ def _add_batch_norm(): epsilon=eps, name=node.name, ) - context.add(bn) + return bn is_batch_norm_1d_rank_2 = input_rank == 2 if training or running_mean.val is None or running_var.val is None or weight is None or bias is None: - _add_batch_norm_dynamic() + bn = _add_batch_norm_dynamic() elif is_batch_norm_1d_rank_2: - _add_batch_norm_1d() + bn = _add_batch_norm_1d() else: - _add_batch_norm() + bn = _add_batch_norm() + + if node.kind == "_native_batch_norm_legit_no_training": + # TODO(rdar://117038279) ([Executorch] Handle/Bind other outputs of `_native_batch_norm_legit_no_training` op during lowering) + bn = (bn, None, None) + + context.add(bn, torch_name=node.name) @register_torch_op @@ -1935,7 +1992,7 @@ def hardtanh(context, node): context.add(res) -@register_torch_op(torch_alias=['concat']) +@register_torch_op(torch_alias=["concat"]) def cat(context, node): inputs = _get_inputs(context, node) axis = 0 if len(inputs) == 1 else inputs[1] @@ -2013,14 +2070,10 @@ def _cast(context, node, dtype, dtype_name): res = mb.const(val=dtype(x.val), name=node.name) else: res = x - elif x.shape == (1,): + elif len(x.shape) > 0: x = mb.squeeze(x=x, name=node.name + "_item") res = mb.cast(x=x, dtype=dtype_name, name=node.name) else: - if len(x.shape) > 0: - # TODO: There's no MIL op to extract a value from a symbolic tensor, - # so as a workaround we use reduce_max to convert it to a scalar. 
- x = mb.reduce_max(x=x, name=node.name + "_item") res = mb.cast(x=x, dtype=dtype_name, name=node.name) context.add(res, node.name) @@ -2035,9 +2088,9 @@ def _int(context, node): _cast(context, node, int, "int32") -@register_torch_op +@register_torch_op(torch_alias=["native_layer_norm"]) def layer_norm(context, node): - inputs = _get_inputs(context, node, expected=6) + inputs = _get_inputs(context, node, min_expected=5) _input = inputs[0] normalized_shape = inputs[1] weight = inputs[2] @@ -2053,7 +2106,12 @@ def layer_norm(context, node): epsilon=eps, name=node.name, ) - context.add(layer_norm) + + if node.kind == "native_layer_norm": + # TODO(rdar://117038370) ([Executorch] Handle/Bind other outputs of `native_layer_norm` op during lowering) + context.add((layer_norm, None, None), torch_name=node.name) + else: + context.add(layer_norm) @register_torch_op @@ -3281,7 +3339,7 @@ def _false_path(): context.add(output_var, torch_name=output_name) -@register_torch_op +@register_torch_op(torch_alias=["select_copy.int"]) def select(context, node): inputs = _get_inputs(context, node, expected=3) _input = inputs[0] @@ -3311,7 +3369,7 @@ def select(context, node): squeeze_mask[dim] = True if index.val != -1: - if index.val is None: + if index.val is None: # index value not know till runtime temp = mb.add(x=index, y=1) end_array[dim] = temp @@ -3331,6 +3389,33 @@ def select(context, node): context.add(slice_by_index) +@register_torch_op +def getitem(context, node): + inputs = _get_inputs(context, node, expected=2) + + if not isinstance(inputs[0], (list, tuple)): + raise AssertionError("Item selection is supported only on python list/tuple objects") + + if inputs[1].val is None: + raise AssertionError("Only static item selection supported") + + try: + index = int(inputs[1].val) + except: + raise AssertionError( + f"Index into python list/tuple needs to be integer. Provided value: {inputs[1].val}" + ) + + out = inputs[0][index] + + if out is None: + raise AssertionError( + f"coremltools lowering didn't handle/bind value at index {index}. Please inspect the lowering of parent op for its return value" + ) + + context.add(out, torch_name=node.name) + + @register_torch_op def type_as(context, node): inputs = _get_inputs(context, node, expected=2) @@ -3357,6 +3442,20 @@ def nonzero(context, node): def _get_slice_params(context, data, inputs): + def _expand_list_to_rank_1(arr): + """ + We make the elements in begin and end rank 1, + so the pattern of ``squeeze -> expand_dims`` can be removed + by the ``fuse_squeeze_expand_dims`` graph pass. 
+ """ + for i, val in enumerate(arr): + if isinstance(val, Var): + if val.rank == 0: + arr[i] = mb.expand_dims(x=val, axes=[0]) + else: + arr[i] = np.array([val]) + return arr + rank = data.rank begin = [0] * rank end = [0] * rank @@ -3400,12 +3499,38 @@ def _get_slice_params(context, data, inputs): begin_mask[i] = True end_mask[i] = True + begin = _expand_list_to_rank_1(begin) + eng = _expand_list_to_rank_1(end) begin = mb.concat(values=begin, axis=0) end = mb.concat(values=end, axis=0) return begin, end, stride, begin_mask, end_mask, squeeze_mask +def _translate_torch_tensor_assign( + x, + updates, + begin, + end, + stride, + begin_mask, + end_mask, + squeeze_mask, + name, +): + return mb.torch_tensor_assign( + x=x, + updates=updates, + begin=begin, + end=end, + stride=stride, + begin_mask=begin_mask, + end_mask=end_mask, + squeeze_mask=squeeze_mask, + name=name, + ) + + @register_torch_op def _internal_op_tensor_inplace_copy(context, node): data = context[node.inputs[0]] @@ -3415,8 +3540,8 @@ def _internal_op_tensor_inplace_copy(context, node): ) data, updates = promote_input_dtypes([data, updates]) - updated_x = mb.torch_tensor_assign( - data=data, + updated_x = _translate_torch_tensor_assign( + x=data, updates=updates, begin=begin, end=end, @@ -3456,8 +3581,9 @@ def _internal_op_tensor_inplace_fill(context, node): update_values = _np.full(fill_shape, fill_scalar.val) data, update_values = promote_input_dtypes([data, update_values]) - updated_x = mb.torch_tensor_assign( - data=data, + + updated_x = _translate_torch_tensor_assign( + x=data, updates=update_values, begin=begin, end=end, @@ -3916,8 +4042,8 @@ def avg_pool1d(context, node): @register_torch_op def avg_pool2d(context, node): - inputs = _get_inputs(context, node, expected=7) - divisor_override = inputs[6] + inputs = _get_inputs(context, node, min_expected=6) + divisor_override = None if len(inputs) < 7 else inputs[6] if divisor_override is not None: raise ValueError("divisor_override is not supported for avg_pool2d") _avg_pool(context, node, inputs) @@ -3932,14 +4058,17 @@ def avg_pool3d(context, node): _avg_pool(context, node, inputs) -@register_torch_op +@register_torch_op(torch_alias=["_log_softmax"]) def log_softmax(context, node): inputs = _get_inputs(context, node) x = inputs[0] axis = inputs[1] - out = inputs[2] # Ignored. - assert out is None + + # input 2 is either out or half_to_float, so we ignore + ignored = inputs[2] + assert ignored is None or ignored.dtype == types.bool + res = mb.softmax(x=x, axis=axis, name=node.name + "_softmax") res = mb.log(x=res, name=node.name) context.add(res) @@ -4126,7 +4255,7 @@ def unbind(context, node): context.add(res, torch_name=node.name) -@register_torch_op +@register_torch_op(torch_alias = ["_to_copy"]) def to(context, node): inputs = _get_inputs(context, node) @@ -4138,13 +4267,14 @@ def to(context, node): # - When len(inputs) == 3, the parameter is (input, non_blocking, copy) # We only use `input` and `dtype`, and `non_blocking` and `copy` are unused. 
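As a rough guide to the argument-count handling that follows, a sketch of the torch-level calls behind the different arities (the exact overload shapes depend on the tracing path, so treat the counts as illustrative):

```python
import torch

x = torch.rand(2, 2)

# Typical sources of the aten::to / _to_copy node seen by the converter:
y1 = x.to(torch.float16)                       # dtype only, small arg count
y2 = x.to(torch.float16, non_blocking=False)   # dtype plus flags
y3 = x.to(device="cpu", dtype=torch.float16)   # device variant, dtype at a later index
# Only `input` and `dtype` are consumed; non_blocking/copy/device are ignored.
```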
_input = inputs[0] + target_dtype: Optional[Var] inputs_len = len(inputs) if inputs_len in (4, 5, 7, 8): target_dtype = inputs[1] elif inputs_len == 6: target_dtype = inputs[2] - elif inputs_len == 3: + elif inputs_len <= 3: target_dtype = None else: raise ValueError( @@ -4242,7 +4372,7 @@ def _broadcast(name, tensor, shape): return res -@register_torch_op +@register_torch_op(torch_alias=["expand_copy"]) def expand(context, node): def _broadcast_dynamic(name, tensor, shape): # Add any extra dimensions @@ -4406,7 +4536,6 @@ def meshgrid(context, node): "detach", "device", "dropout", - "dropout_", "feature_dropout", "lift_fresh", ] @@ -4435,9 +4564,12 @@ def argmax(context, node): def zeros_like(context, node): inputs = _get_inputs(context, node, expected=6) x = inputs[0] - dtype = inputs[1].val shape = mb.shape(x=x) - np_type = NUM_TO_NUMPY_DTYPE[dtype] + if inputs[1] and inputs[1].val: + dtype = inputs[1].val + np_type = NUM_TO_NUMPY_DTYPE[dtype] + else: + np_type = nptype_from_builtin(x.dtype) if shape.can_be_folded_to_const(): shape = shape.val @@ -4836,10 +4968,10 @@ def ceil(context, node): @register_torch_op def clamp(context, node): - inputs = _get_inputs(context, node, expected=3) + inputs = _get_inputs(context, node, expected=[1,2,3]) x = inputs[0] - min_val = inputs[1] if inputs[1] else _np.finfo(_np.float32).min - max_val = inputs[2] if inputs[2] else _np.finfo(_np.float32).max + min_val = inputs[1] if (len(inputs) > 1 and inputs[1]) else mb.const(val=_np.finfo(_np.float32).min) + max_val = inputs[2] if (len(inputs) > 2 and inputs[2]) else mb.const(val=_np.finfo(_np.float32).max) if isinstance(min_val, Var) and isinstance(max_val, Var) and min_val.val >= max_val.val: # When min >= max, PyTorch sets all values to max. @@ -6153,51 +6285,77 @@ def tupleindex(context, node): context.add(tuple_input[index_input.val], node.name) -def _get_attn_mask(is_causal: Var, attn_mask: Var, query_var: Var, key_var: Var) -> Var: - if is_causal.val: - # create mask of shape (target_seq, source_seq) - # s.t the diagonal and lower triangular of the matrix is all 1s - # and upper triangular is a large negative number (e.g. 
-30k) - target_seq = query_var.shape[-2] - source_seq = key_var.shape[-2] - if is_symbolic(target_seq) or is_symbolic(source_seq): - raise NotImplementedError( - "scaled_dot_product_attention op: " - "is_causal flag not handled when sequence length is symbolic" - ) +def _get_causal_attn_mask(is_causal: bool, query_var: Var, key_var: Var) -> Var: + assert is_causal - all_ones = mb.fill(value=1.0, shape=(target_seq, source_seq)) - all_negative_inf = mb.fill(value=-3e4, shape=(target_seq, source_seq)) - all_ones_lower = mb.band_part( - x=all_ones, lower=-1, upper=0 - ) # will 0 out upper triangle, excluding diag - all_negative_inf_upper = mb.band_part( - x=all_negative_inf, lower=0, upper=-1 - ) # will 0 out lower triangle, excluding diag - all_negative_inf_diag_only = mb.band_part(x=all_negative_inf_upper, lower=0, upper=0) - all_negative_inf_upper_no_diag = mb.sub( - x=all_negative_inf_upper, y=all_negative_inf_diag_only - ) - return mb.add(x=all_ones_lower, y=all_negative_inf_upper_no_diag) - elif is_bool(attn_mask.dtype): - """ - compute float mask as: - mask = cast(bool_mask) + (1-cast(bool_mask)) * -30k*ones(shape(bool_mask)) - """ - shape = mb.shape(x=attn_mask) - negative_inf = mb.fill( - shape=shape, value=_np.array([-3e4]).astype(types.nptype_from_builtin(query_var.dtype)) + # create mask of shape (target_seq, source_seq) + # s.t the diagonal and lower triangular of the matrix is all 1s + # and upper triangular is a large negative number (e.g. -30k) + target_seq = query_var.shape[-2] + source_seq = key_var.shape[-2] + if is_symbolic(target_seq) or is_symbolic(source_seq): + raise NotImplementedError( + "scaled_dot_product_attention op: " + "is_causal flag not handled when sequence length is symbolic" ) - mask = mb.cast(x=attn_mask, dtype=types.builtin_to_string(query_var.dtype)) - compliment_of_mask = mb.sub( - x=_np.array([1.0]).astype(types.nptype_from_builtin(mask.dtype)), y=mask + + all_ones = mb.fill(value=1.0, shape=(target_seq, source_seq)) + all_negative_inf = mb.fill(value=-3e4, shape=(target_seq, source_seq)) + all_ones_lower = mb.band_part( + x=all_ones, lower=-1, upper=0 + ) # will 0 out upper triangle, excluding diag + all_negative_inf_upper = mb.band_part( + x=all_negative_inf, lower=0, upper=-1 + ) # will 0 out lower triangle, excluding diag + all_negative_inf_diag_only = mb.band_part(x=all_negative_inf_upper, lower=0, upper=0) + all_negative_inf_upper_no_diag = mb.sub(x=all_negative_inf_upper, y=all_negative_inf_diag_only) + return mb.add(x=all_ones_lower, y=all_negative_inf_upper_no_diag) + + +def _cast_bool_attn_mask(attn_mask: Var, query_var: Var) -> Var: + """ + compute float mask as: + mask = cast(bool_mask) + (1-cast(bool_mask)) * -30k*ones(shape(bool_mask)) + """ + assert is_bool(attn_mask.dtype) + + shape = mb.shape(x=attn_mask) + negative_inf = mb.fill( + shape=shape, value=_np.array([-3e4]).astype(types.nptype_from_builtin(query_var.dtype)) + ) + mask = mb.cast(x=attn_mask, dtype=types.builtin_to_string(query_var.dtype)) + compliment_of_mask = mb.sub( + x=_np.array([1.0]).astype(types.nptype_from_builtin(mask.dtype)), y=mask + ) + compliment_of_mask = mb.mul(x=negative_inf, y=compliment_of_mask) + return mb.add(x=mask, y=compliment_of_mask) + + +def _lower_scaled_dot_product_attention(q: Var, k: Var, v: Var, mask: Var, name: str) -> Var: + # scale the query input + embed_size = q.shape[-1] + if is_symbolic(embed_size): + raise ValueError( + "The embedding size, i.e. 
last dimension of the shape of query tensor" + " cannot be symbolic, in scaled_dot_product_attention op" ) - compliment_of_mask = mb.mul(x=negative_inf, y=compliment_of_mask) - return mb.add(x=mask, y=compliment_of_mask) - else: - return attn_mask + multiplicative_scale_factor = 1 / _math.sqrt(embed_size) + q = mb.mul(x=q, y=multiplicative_scale_factor) + # multiply query and key input tensors + # shape of output: (target_seq, source_seq) or (B,...,target_seq, source_seq) + attn_weights = mb.matmul(x=q, y=k, transpose_y=True) + + # add mask if applicable + if mask is not None: + attn_weights = mb.add(x=attn_weights, y=mask) + + # do softmax + attn_weights_normalized = mb.softmax(x=attn_weights, axis=-1) + # multiply attn_weights and value tensor + res = mb.matmul(x=attn_weights_normalized, y=v, name=name) + return res @register_torch_op def scaled_dot_product_attention(context, node): @@ -6214,14 +6372,22 @@ def scaled_dot_product_attention(context, node): output = softmax(scale*Q*K^transpose + mask) * V + Currently, Core ML does not support dropout, so it has to be either None or 0 + See details at: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html """ - q, k, v, attn_mask, dropout, is_causal = _get_inputs(context, node, expected=6) - if attn_mask is not None and is_causal.val: + inputs = _get_inputs(context, node, min_expected=3) + q, k, v = inputs[: 3] + attn_mask = None if len(inputs) < 4 else inputs[3] + dropout = 0.0 if len(inputs) < 5 else inputs[4] + is_causal = False if len(inputs) < 6 else inputs[5].val + if attn_mask is not None and is_causal: raise ValueError( "scaled_dot_product_attention op: attn_mask cannot be provided when is_causal is set to True." ) + if dropout is not None and (dropout.val is None or dropout.val != 0.0): + raise ValueError("scaled_dot_product_attention op: dropout is not supported yet") # check that ranks of q, k, v and attn_mask match if k.rank != q.rank: @@ -6233,34 +6399,16 @@ def scaled_dot_product_attention(context, node): "Rank of query and value do not match in scaled_dot_product_attention torch op" ) - is_mask_present = False - if is_causal.val or attn_mask is not None: - is_mask_present = True - mask = _get_attn_mask(is_causal, attn_mask, q, k) - - # scale the query input - embed_size = q.shape[-1] - if is_symbolic(embed_size): - raise ValueError( - "The embedding size, i.e. 
last dimension of the shape of query tensor" - " cannot be symbolic, in scaled_dot_product_attention op" - ) - multiplicative_scale_factor = 1 / _math.sqrt(embed_size) - q = mb.mul(x=q, y=multiplicative_scale_factor) - - # multiply query and key input tensors - # shape of output: (target_seq, source_seq) or (B,...,target_seq, source_seq) - attn_weights = mb.matmul(x=q, y=k, transpose_y=True) - - # add mask if applicable - if is_mask_present: - attn_weights = mb.add(x=attn_weights, y=mask) - - # do softmax - attn_weights_normalized = mb.softmax(x=attn_weights, axis=-1) + mask = None + if is_causal: + mask = _get_causal_attn_mask(is_causal, q, k) + elif attn_mask is not None: + if is_bool(attn_mask.dtype): + mask = _cast_bool_attn_mask(attn_mask, q) + else: + mask = attn_mask - # multiply attn_weights and value tensor - res = mb.matmul(x=attn_weights_normalized, y=v, name=node.name) + res = _lower_scaled_dot_product_attention(q, k, v, mask, node.name) context.add(res) @@ -6276,3 +6424,16 @@ def fliplr(context, node): x = _get_inputs(context, node, expected=1)[0] res = mb.reverse(x=x, axes=[1], name=node.name) context.add(res) + + +@register_torch_op +def multinomial(context, node): + x = context[node.inputs[0]] + num_samples = context[node.inputs[1]].val + replacement = context[node.inputs[2]].val + if num_samples is None: + raise ValueError("In torch.multinomial op, num_samples must be const") + if num_samples > 1 and not replacement: + raise ValueError("When num_samples is larger than 1, only replacement=True is supported.") + x = mb.random_categorical(x=x, size=num_samples, name=node.name) + context.add(x) diff --git a/coremltools/converters/mil/frontend/torch/quantization_ops.py b/coremltools/converters/mil/frontend/torch/quantization_ops.py index dd320c75e..2b1389ce4 100644 --- a/coremltools/converters/mil/frontend/torch/quantization_ops.py +++ b/coremltools/converters/mil/frontend/torch/quantization_ops.py @@ -95,7 +95,23 @@ def dequantize(context, node): context.quant_context.get_dequantized_var(node.inputs[0], node.name) -def _dequantized_weight(qweight, name:str = None): +def _construct_constexpr_affine_op(quantized_weights, zero_point, scale, axis=None, name=None): + """Constructs the constexpr op to represent the dequantized weight from PyTorch's data.""" + if axis is None: + # It's per-tensor quantization, just use a dummy value for axis. + axis = _np.int32(0) + kwargs = { + "quantized_data": quantized_weights, + "zero_point": zero_point, + "scale": scale, + "axis": axis, + } + if name is not None: + kwargs["name"] = name + return mb.constexpr_affine_dequantize(**kwargs) + + +def _dequantized_weight(qweight, name: str = None): """ Given the first output (qweight) of torch.ops.quantized.conv2d/linear_unpack, this returns a dequantized version of the tensor to be added to the context. @@ -105,24 +121,15 @@ def _dequantized_weight(qweight, name:str = None): scale = _np.float32(qweight.q_scale()) zero_point = quant_dtype_np(qweight.q_zero_point()) quantized_weights = _torch.int_repr(qweight).numpy() - # Axis doesn't matter for per-tensor quantization. 
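The new `_construct_constexpr_affine_op` helper above centralizes how both the per-tensor and per-channel branches emit `constexpr_affine_dequantize`. Numerically that op represents the usual affine dequantization, sketched here with made-up values:

```python
import numpy as np

quantized = np.array([0, 128, 255], dtype=np.uint8)
zero_point = np.uint8(128)
scale = np.float32(0.1)

# constexpr_affine_dequantize materializes scale * (quantized_data - zero_point)
dequantized = scale * (quantized.astype(np.float32) - np.float32(zero_point))
# -> array([-12.8,  0. ,  12.7], dtype=float32)
```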
- axis = _np.int32(0) - kwargs = { - "quantized_data": quantized_weights, - "zero_point": zero_point, - "scale": scale, - "axis": axis, - } - if name is not None: - kwargs["name"] = name - dequant_weights = mb.constexpr_affine_dequantize(**kwargs) + dequant_weights = _construct_constexpr_affine_op( + quantized_weights, zero_point, scale, axis=None, name=name + ) # per_channel_affine_float_qparams is same as per_channel_affine except that it # expects both scale and zero point to be floating point values. elif qweight.qscheme() in {_torch.per_channel_affine, _torch.per_channel_affine_float_qparams}: quant_dtype_np = TORCH_QTYPE_TO_NP_TYPE[qweight.dtype] # TODO: How do we set the appropriate dtype here (fp16/fp32)? scale = qweight.q_per_channel_scales().numpy() - zero_point = quant_dtype_np(qweight.q_per_channel_zero_points().numpy()) if qweight.qscheme() == _torch.per_channel_affine: zero_point = quant_dtype_np(qweight.q_per_channel_zero_points().numpy()) else: @@ -139,17 +146,10 @@ def _dequantized_weight(qweight, name:str = None): ) zero_point = quant_dtype_np(val) quantized_weights = _torch.int_repr(qweight).numpy() - # Axis doesn't matter for per-tensor quantization. - axis = _np.int32(0) - kwargs = { - "quantized_data": quantized_weights, - "zero_point": zero_point, - "scale": scale, - "axis": axis, - } - if name is not None: - kwargs["name"] = name - dequant_weights = mb.constexpr_affine_dequantize(**kwargs) + axis = _np.int32(qweight.q_per_channel_axis()) + dequant_weights = _construct_constexpr_affine_op( + quantized_weights, zero_point, scale, axis=axis, name=name + ) else: raise ValueError(f'Unsupported quant scheme "{qweight.qscheme()}"') return dequant_weights diff --git a/coremltools/converters/mil/frontend/torch/ssa_passes/torch_tensor_assign_to_core.py b/coremltools/converters/mil/frontend/torch/ssa_passes/torch_tensor_assign_to_core.py index a24a31c36..297a44135 100644 --- a/coremltools/converters/mil/frontend/torch/ssa_passes/torch_tensor_assign_to_core.py +++ b/coremltools/converters/mil/frontend/torch/ssa_passes/torch_tensor_assign_to_core.py @@ -31,7 +31,7 @@ def _torch_tensor_assign_to_core_block(block): def _transform_tensor_assign(op, block): - shape = mb.shape(x=op.data, before_op=op) + shape = mb.shape(x=op.x, before_op=op) dim_prod = mb.reduce_prod(x=shape, before_op=op) ref_indices = mb.range_1d(end=dim_prod, start=0, step=1, before_op=op) ref_indices = mb.reshape(x=ref_indices, shape=shape, before_op=op) @@ -47,18 +47,18 @@ def _transform_tensor_assign(op, block): ) flatten_indices = mb.reshape(x=ref_sliced_indices, shape=[-1], before_op=op) flatten_updates = mb.reshape(x=op.updates, shape=[-1], before_op=op) - flatten_data = mb.reshape(x=op.data, shape=[-1], before_op=op) + flatten_data = mb.reshape(x=op.x, shape=[-1], before_op=op) new_data = mb.scatter( - data=flatten_data, - indices=flatten_indices, - updates=flatten_updates, - mode="update", - before_op=op - ) + data=flatten_data, + indices=flatten_indices, + updates=flatten_updates, + mode="update", + before_op=op, + ) new_data = mb.reshape(x=new_data, shape=shape, before_op=op) op.enclosing_block.replace_uses_of_var_after_op( anchor_op=op, old_var=op.outputs[0], new_var=new_data ) # Remove all the ops at once - block.remove_ops([op]) \ No newline at end of file + block.remove_ops([op]) diff --git a/coremltools/converters/mil/frontend/torch/test/test_api.py b/coremltools/converters/mil/frontend/torch/test/test_api.py deleted file mode 100644 index 4a3e0cf51..000000000 --- 
a/coremltools/converters/mil/frontend/torch/test/test_api.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2021, Apple Inc. All rights reserved. -# -# Use of this source code is governed by a BSD-3-clause license that can be -# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause - -import os - -import pytest -import torch -import torchvision - -import coremltools as ct -from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND -from coremltools.converters.mil.testing_reqs import backends - -if _HAS_TORCH: - import torch - import torchvision - - -@pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) -class TestPyTorchConverter: - @staticmethod - @pytest.mark.parametrize( - "backend", - backends, - ) - def test_no_inputs(backend): - model = torchvision.models.mobilenet_v2() - model.eval() - - example_input = torch.rand(1, 3, 256, 256) - - traced_model = torch.jit.trace(model, example_input) - - with pytest.raises(ValueError) as e: - ct.convert(traced_model, convert_to=backend[0]) - e.match(r'Expected argument for pytorch "inputs" not provided') - - - @staticmethod - @pytest.mark.parametrize( - "backend", - backends, - ) - def test_pth_extension(tmpdir, backend): - # test for issue: https://github.com/apple/coremltools/issues/917 - class TestModule(torch.nn.Module): - def __init__(self): - super(TestModule, self).__init__() - self.linear = torch.nn.Linear(10, 20) - - def forward(self, x): - return self.linear(x) - - model = TestModule() - model.eval() - example_input = torch.rand(1, 10) - traced_model = torch.jit.trace(model, example_input) - model_path = os.path.join(str(tmpdir), "torch_model.pth") - traced_model.save(model_path) - - ct.convert( - model_path, - source='pytorch', - inputs=[ - ct.TensorType( - shape=example_input.shape, - ) - ], - convert_to=backend[0], - ) diff --git a/coremltools/converters/mil/frontend/torch/test/test_custom_ops.py b/coremltools/converters/mil/frontend/torch/test/test_custom_ops.py index d8e266c98..799d753ff 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_custom_ops.py +++ b/coremltools/converters/mil/frontend/torch/test/test_custom_ops.py @@ -28,7 +28,7 @@ # Log Converter supported Cosine Similarity conversion function -default_cosine_similarity = _TORCH_OPS_REG.get("cosine_similarity", None) +default_cosine_similarity = _TORCH_OPS_REG.get_func("cosine_similarity") @register_torch_op(override=True) @@ -37,11 +37,11 @@ def cosine_similarity(context, node): # Log custom Cosine Similarity conversion function -custom_cosine_similarity = _TORCH_OPS_REG["cosine_similarity"] +custom_cosine_similarity = _TORCH_OPS_REG.get_func("cosine_similarity") def _set_torch_reg_op(op_type, op_func): - _TORCH_OPS_REG[op_type] = op_func + _TORCH_OPS_REG.set_func_by_name(op_func, op_type) class TestCompositeOp(TorchBaseTest): @@ -69,7 +69,7 @@ class custom_torch_sparse_matmul(Operation): x_is_sparse=TensorInputType(const=True, optional=True, type_domain=types.bool), y_is_sparse=TensorInputType(const=True, optional=True, type_domain=types.bool), ) - + type_domains = { "T": (types.fp16, types.fp32), } diff --git a/coremltools/converters/mil/frontend/torch/test/test_executorch_e2e.py b/coremltools/converters/mil/frontend/torch/test/test_executorch_e2e.py new file mode 100644 index 000000000..14627cf1a --- /dev/null +++ b/coremltools/converters/mil/frontend/torch/test/test_executorch_e2e.py @@ -0,0 +1,158 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
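The new end-to-end ExecuTorch tests below all follow the same capture-then-convert pattern; a condensed sketch of that flow (mirroring the exir configs used in test_torch_conversion_api.py; the model and shape are placeholders):

```python
import torch
import coremltools as ct
from executorch import exir

model = torch.nn.Linear(3, 3).eval()
exir_program = (
    exir.capture(model, (torch.rand(1, 3),), exir.CaptureConfig(enable_aot=True))
    .to_edge(exir.EdgeCompileConfig(_check_ir_validity=False))
    .exported_program
)
# ExportedProgram inputs/outputs are inferred, so no `inputs=` argument is passed.
mlmodel = ct.convert(exir_program, convert_to="mlprogram")
```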
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import pytest + +from coremltools._deps import _HAS_EXECUTORCH, _HAS_TORCH_VISION + +if not (_HAS_EXECUTORCH and _HAS_TORCH_VISION): + pytest.skip(allow_module_level=True, reason="executorch and torchvision are required") + +import torch +import torchvision +import torchaudio + +import timm + +from .testing_utils import TorchBaseTest, TorchFrontend + + +class TestExecutorch(TorchBaseTest): + def test_mul(self): + class MulModule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward(self, input, other): + return input * other + + model = MulModule() + model.eval() + + self.run_compare_torch( + [(3, 2), (3, 2)], + model, + frontend=TorchFrontend.EDGEIR, + ) + + def test_linear(self): + class LinearModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(3, 3) + + def forward(self, arg): + return self.linear(arg) + + model = LinearModule() + model.eval() + + self.run_compare_torch( + [(3, 3)], model, frontend=TorchFrontend.EDGEIR, backend=("mlprogram", "fp16") + ) + + def test_add(self): + class AddModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + z = x + y + z = z + x + z = z + x + z = z + z + return z + + model = AddModule() + model.eval() + + self.run_compare_torch( + [(1,), (1,)], model, frontend=TorchFrontend.EDGEIR, backend=("mlprogram", "fp16") + ) + + def test_add_mul(self): + class AddMulModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, a, x, b): + y = torch.mm(a, x) + z = torch.add(y, b) + return z + + model = AddMulModule() + model.eval() + + self.run_compare_torch( + [(2, 2), (2, 2), (2, 2)], + model, + frontend=TorchFrontend.EDGEIR, + backend=("mlprogram", "fp16"), + ) + + def test_mobilenet_v2(self): + model = torchvision.models.mobilenet_v2(weights=torchvision.models.mobilenetv2.MobileNet_V2_Weights.DEFAULT) + model.eval() + + self.run_compare_torch( + [(1, 3, 224, 224)], model, frontend=TorchFrontend.EDGEIR, backend=("mlprogram", "fp16") + ) + + def test_mobilenet_v3(self): + model = torchvision.models.mobilenet_v3_small(pretrained=True) + model.eval() + + self.run_compare_torch( + [(1, 3, 224, 224)], model, frontend=TorchFrontend.EDGEIR, backend=("mlprogram", "fp16") + ) + + def test_vit(self): + model = torchvision.models.vit_b_16(weights="IMAGENET1K_V1") + model.eval() + + self.run_compare_torch( + [(1, 3, 224, 224)], model, frontend=TorchFrontend.EDGEIR, backend=("mlprogram", "fp16") + ) + + def test_wav2letter(self): + model = torchaudio.models.Wav2Letter(num_classes=4096) + model.eval() + + self.run_compare_torch( + [(10, 1, 700)], model, frontend=TorchFrontend.EDGEIR, backend=("mlprogram", "fp16") + ) + + @pytest.mark.xfail(reason="Nodes of type get_attr not yet implemented") + def test_inception_v3(self): + model = torchvision.models.inception_v3(weights="IMAGENET1K_V1") + model.eval() + + self.run_compare_torch( + [(1, 3, 224, 224)], model, frontend=TorchFrontend.EDGEIR, backend=("mlprogram", "fp16") + ) + + def test_inception_v4(self): + model = timm.models.inception_v4(pretrained=True) + model.eval() + + self.run_compare_torch( + [(1, 3, 299, 299)], model, frontend=TorchFrontend.EDGEIR, backend=("mlprogram", "fp16") + ) + + def test_resnet18(self): + model = 
torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1) + model.eval() + + self.run_compare_torch( + [(1, 3, 224, 224)], model, frontend=TorchFrontend.EDGEIR, backend=("mlprogram", "fp16") + ) + + def test_resnet50(self): + model = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.IMAGENET1K_V1) + model.eval() + + self.run_compare_torch( + [(1, 3, 224, 224)], model, frontend=TorchFrontend.EDGEIR, backend=("mlprogram", "fp16") + ) diff --git a/coremltools/converters/mil/frontend/torch/test/test_internal_graph.py b/coremltools/converters/mil/frontend/torch/test/test_internal_graph.py index d2c3fdb8b..82d226adf 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_internal_graph.py +++ b/coremltools/converters/mil/frontend/torch/test/test_internal_graph.py @@ -523,7 +523,7 @@ def test_permute(self, context, input_shape): kind="Permute", inputs=input_list, outputs=[output_name], ) ssa = self._construct_test_graph( - context, ops.permute, permute_node, output_name, constants=constants + context, ops.permute_copy, permute_node, output_name, constants=constants ) expected_result = test_data.permute(*permutation) assert expected_result.shape == ssa.shape @@ -1483,7 +1483,7 @@ def test_erf(self, context): context, ops.erf, node, output_name, constants=constants ) expected_result = test_input.erf() - assert np.allclose(expected_result, ssa.val) + assert np.allclose(expected_result, ssa.val, atol=1e-05) def test_implicittensortonum(self, context): input_shape = (1,) diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py b/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py index 487db8256..be62a74bb 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py @@ -11,8 +11,17 @@ from PIL import Image import coremltools as ct -from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND +from coremltools._deps import ( + _HAS_EXECUTORCH, + _HAS_TORCH, + MSG_EXECUTORCH_NOT_FOUND, + MSG_TORCH_NOT_FOUND, +) from coremltools.converters.mil.frontend.torch.test.testing_utils import _copy_input_data +from coremltools.converters.mil.frontend.torch.torch_op_registry import ( + _TORCH_OPS_REGISTRY, + register_torch_op, +) from coremltools.converters.mil.testing_reqs import backends from coremltools.converters.mil.testing_utils import ( assert_cast_ops_count, @@ -26,6 +35,7 @@ get_op_types_in_program, verify_prediction, ) +from coremltools.models import _METADATA_SOURCE_DIALECT from coremltools.proto import FeatureTypes_pb2 as ft from coremltools.test.api.test_api_examples import TestInputs as _TestInputs @@ -35,9 +45,228 @@ torch.manual_seed(1818) +if _HAS_EXECUTORCH: + from executorch import exir + + _CAPTURE_CONFIG = exir.CaptureConfig(enable_aot=True) + _EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( + _check_ir_validity=False, + ) + + +@pytest.fixture +def torch_model(): + class TestModule(torch.nn.Module): + def __init__(self): + super(TestModule, self).__init__() + self.linear = torch.nn.Linear(10, 20) + + def forward(self, x): + return self.linear(x) + + model = TestModule() + model.eval() + return model + + +@pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) +class TestTorchScriptValidation: + @staticmethod + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_no_inputs(torch_model, backend): + + traced_torch_model = torch.jit.trace(torch_model, 
torch.rand(1, 10)) + with pytest.raises( + ValueError, match=r'Expected argument "inputs" for TorchScript models not provided' + ): + ct.convert(traced_torch_model, convert_to=backend[0]) + + @staticmethod + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_pth_extension(torch_model, tmpdir, backend): + # test for issue: https://github.com/apple/coremltools/issues/917 + + shape = (1, 10) + traced_torch_model = torch.jit.trace(torch_model, torch.rand(*shape)) + + model_path = os.path.join(str(tmpdir), "torch_model.pth") + traced_torch_model.save(model_path) + + ct.convert( + model_path, + source="pytorch", + inputs=[ + ct.TensorType( + shape=shape, + ) + ], + convert_to=backend[0], + ) + + @staticmethod + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_source_dialect_metadata(torch_model, backend): + shape = (1, 10) + traced_torch_model = torch.jit.trace(torch_model, torch.rand(*shape)) + + mlmodel = ct.convert( + traced_torch_model, + source="pytorch", + inputs=[ + ct.TensorType( + shape=shape, + ) + ], + convert_to=backend[0], + ) + + assert _METADATA_SOURCE_DIALECT in mlmodel.user_defined_metadata + + assert mlmodel.user_defined_metadata[_METADATA_SOURCE_DIALECT] == "TorchScript" + + + +@pytest.mark.skipif(not _HAS_EXECUTORCH, reason=MSG_EXECUTORCH_NOT_FOUND) +class TestEdgeIRValidation: + @staticmethod + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_inputs( + torch_model, backend + ): # TODO: rdar://115845792 ([Executorch] Handle user provided inputs/outputs in the convert API) + + shape = (1, 10) + exir_program = ( + exir.capture(torch_model, (torch.rand(*shape),), _CAPTURE_CONFIG) + .to_edge(_EDGE_COMPILE_CONFIG) + .exported_program + ) + + with pytest.raises( + AssertionError, match=r"'inputs' argument should be None for ExportedProgram" + ): + ct.convert( + exir_program, + convert_to=backend[0], + inputs=[ct.TensorType(shape=shape)], + ) + + @staticmethod + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_outputs( + torch_model, backend + ): # TODO: rdar://115845792 ([Executorch] Handle user provided inputs/outputs in the convert API) + + shape = (1, 10) + exir_program = ( + exir.capture(torch_model, (torch.rand(*shape),), _CAPTURE_CONFIG) + .to_edge(_EDGE_COMPILE_CONFIG) + .exported_program + ) + + with pytest.raises( + AssertionError, match=r"'outputs' argument should be None for ExportedProgram" + ): + ct.convert(exir_program, convert_to=backend[0], outputs=[ct.TensorType(name="result")]) + + @staticmethod + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_source_dialect_metadata(torch_model, backend): + shape = (1, 10) + exir_program = ( + exir.capture(torch_model, (torch.rand(*shape),), _CAPTURE_CONFIG) + .to_edge(_EDGE_COMPILE_CONFIG) + .exported_program + ) + + mlmodel = ct.convert( + exir_program, + source="pytorch", + convert_to=backend[0], + ) + + assert _METADATA_SOURCE_DIALECT in mlmodel.user_defined_metadata + + assert mlmodel.user_defined_metadata[_METADATA_SOURCE_DIALECT] == "TorchExport::EDGE" + +@pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) +class TestTorchOpsRegistry: + @staticmethod + def test_api_example(): + # Example code in https://apple.github.io/coremltools/docs-guides/source/composite-operators.html#using-composite-ops-with-pytorch-conversion + # Whenever this test fails, we should update API documentations + # This test needs to be modified after rdar://117502178 ([Infra][Pytorch] We should deprecate the direct use of _TORCH_OPS_REGISTRY in 7.2) + from 
coremltools.converters.mil import Builder as mb + from coremltools.converters.mil.frontend.torch.ops import _get_inputs + from coremltools.converters.mil.frontend.torch.torch_op_registry import ( + _TORCH_OPS_REGISTRY, + register_torch_op, + ) + + default_func = _TORCH_OPS_REGISTRY.get_func("selu") + + # Test ``__contains__`` and ``__delitem__`` + assert "selu" in _TORCH_OPS_REGISTRY + if "selu" in _TORCH_OPS_REGISTRY: + del _TORCH_OPS_REGISTRY["selu"] + assert not "selu" in _TORCH_OPS_REGISTRY + + # Test ``@register_torch_op`` decorator + @register_torch_op + def selu(context, node): + x = _get_inputs(context, node, expected=1)[0] + x = mb.elu(x=x, alpha=1.6732632423543772) + x = mb.mul(x=x, y=1.0507009873554805, name=node.name) + context.add(x) + + # Test ``__getitem__`` + assert _TORCH_OPS_REGISTRY["selu"] is not None + + # Test ``__setitem__`` + _TORCH_OPS_REGISTRY["selu"] = default_func + + @staticmethod + def test_register_torch_op(): + # Test ``register_torch_op`` works + def test_func_dummy(context, inputs): + return + register_torch_op(test_func_dummy) + assert _TORCH_OPS_REGISTRY.name_to_func_mapping["test_func_dummy"] is test_func_dummy + + # Test error out for duplicate registration + with pytest.raises(ValueError, match="Torch op test_func_dummy already registered."): + register_torch_op(test_func_dummy) + + # Test we can override the function + def test_func_dummy(context, inputs): + dummy = 1 + return + register_torch_op(test_func_dummy, override=True) + assert _TORCH_OPS_REGISTRY.name_to_func_mapping["test_func_dummy"] is test_func_dummy + + # Cleanup the test + del _TORCH_OPS_REGISTRY.name_to_func_mapping["test_func_dummy"] + ################################################################################# -# Note: all tests are also used as examples in https://coremltools.readme.io/docs -# as a reference. +# Note: Starting from here, all of the following tests are also used as examples +# in https://coremltools.readme.io/docs as a reference. 
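
For reference, the Edge IR path exercised by TestEdgeIRValidation above boils down to the following minimal sketch. It assumes torch >= 2.1 and executorch are installed; the toy module, shapes, and the "mlprogram" target are illustrative placeholders, not part of this test suite.

import torch
import coremltools as ct
from executorch import exir

class ToyModule(torch.nn.Module):
    def forward(self, x, y):
        return x * y

model = ToyModule().eval()
example_inputs = (torch.rand(3, 2), torch.rand(3, 2))

# Capture to the ATen dialect, then lower to the EDGE dialect.
exir_program = (
    exir.capture(model, example_inputs, exir.CaptureConfig(enable_aot=True))
    .to_edge(exir.EdgeCompileConfig(_check_ir_validity=False))
    .exported_program
)

# For ExportedProgram sources, inputs/outputs are read from the program itself,
# so no "inputs"/"outputs" arguments are passed to ct.convert.
mlmodel = ct.convert(exir_program, convert_to="mlprogram")
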
# Whenever any of the following test fails, we should update API documentations ################################################################################# @@ -1137,11 +1366,15 @@ def test_input_name_specified_by_user(self, float32_input_model_relu_ops, def test_two_input_model(self, float32_two_input_model): # test that error is raised if only 1 input is provided - with pytest.raises(ValueError): - ct.convert(float32_two_input_model, - inputs=[ct.TensorType(shape=(10, 20), dtype=np.int32)], - minimum_deployment_target=ct.target.macOS12) - + with pytest.raises( + ValueError, + match="Number of TorchScript inputs \(2\) must match the user provided inputs \(1\).", + ): + ct.convert( + float32_two_input_model, + inputs=[ct.TensorType(shape=(10, 20), dtype=np.int32)], + minimum_deployment_target=ct.target.macOS12, + ) # test forcing 1st input to type int32 mlmodel = ct.convert(float32_two_input_model, diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py index 1deb09a1b..b8b02966f 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py @@ -11,17 +11,19 @@ import numpy as np import pytest import torch.nn as nn -import torchaudio -import torchvision import coremltools as ct from coremltools import RangeDim, Shape, TensorType -from coremltools._deps import version_lt +from coremltools._deps import ( + _HAS_EXECUTORCH, + _HAS_TORCH_AUDIO, + _HAS_TORCH_VISION, + version_lt, +) from coremltools.converters.mil import testing_reqs from coremltools.converters.mil.frontend.torch.ops import ( NUM_TO_TORCH_DTYPE, NUMPY_DTYPE_TO_TORCH_NUM, - TORCH_DTYPE_TO_NUM, ) from coremltools.converters.mil.mil import Operation, Program, types from coremltools.converters.mil.mil.var import Var @@ -33,7 +35,25 @@ ) from coremltools.models.utils import _macos_version, _python_version -from .testing_utils import ModuleWrapper, TorchBaseTest, contains_op, generate_input_data +from .testing_utils import ( + ModuleWrapper, + TorchBaseTest, + TorchFrontend, + contains_op, + generate_input_data, +) + +if _HAS_TORCH_AUDIO: + import torchaudio + +if _HAS_TORCH_VISION: + import torchvision + + +frontends = [TorchFrontend.TORCHSCRIPT] + +if _HAS_EXECUTORCH: + frontends.append(TorchFrontend.EDGEIR) backends = testing_reqs.backends compute_units = testing_reqs.compute_units @@ -182,8 +202,10 @@ def forward(self, x): use_scripting=True ) - @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) - def test_linear(self, compute_unit, backend): + @pytest.mark.parametrize( + "compute_unit, backend, frontend", itertools.product(compute_units, backends, frontends) + ) + def test_linear(self, compute_unit, backend, frontend): class Model(torch.nn.Module): def __init__(self): super(Model, self).__init__() @@ -199,6 +221,7 @@ def forward(self, x): model, input_as_shape=False, backend=backend, + frontend=frontend, compute_unit=compute_unit, use_scripting=True, ) @@ -4092,6 +4115,11 @@ class TestTypeAs(TorchBaseTest): itertools.product(compute_units, backends, ["int32", "float32", "bool"]), ) def test_type_as(self, compute_unit, backend, type): + if backend == ("mlprogram", "fp16") and type == "bool": + pytest.xfail( + "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" + ) + class TestNet(nn.Module): def forward(self, x, y): return x.type_as(y) @@ -4417,7 +4445,8 @@ def forward(self, x, y): 
converter_input_type=[ TensorType( shape=[ct.RangeDim(upper_bound=20 if backend[0] == "mlprogram" else -1), 1] - ) + ), + TensorType(shape=(2,)), ], backend=backend, compute_unit=compute_unit, @@ -4707,6 +4736,11 @@ def forward(self, x, y): ), ) def test_unary_einsum(self, compute_unit, backend, equation, dynamic): + if backend == ("mlprogram", "fp16") and equation == "iijk->ji" and dynamic: + pytest.xfail( + "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" + ) + class TestUnaryEinsum(nn.Module): def forward(self, x): return torch.einsum(equation, x) @@ -5571,6 +5605,39 @@ def test_bmm(self, compute_unit, backend): [shape_x, shape_y], model, backend=backend, compute_unit=compute_unit ) + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_bmm_with_fp16_inputs(self, compute_unit, backend): + if backend == ("mlprogram", "fp16"): + pytest.xfail( + "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" + ) + + class TestModel(torch.nn.Module): + def forward(self, x, y): + x = x.to(torch.float16) + y = y + 1 + return torch.bmm(x, y) + + inputs = [ + TensorType(name="x", shape=(1, 2, 3), dtype=np.int32), + TensorType(name="y", shape=(1, 3, 2), dtype=np.float16), + ] + + self.run_compare_torch( + inputs, + TestModel(), + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=ct.target.iOS16, + torch_device=torch.device("mps"), + ) + class TestNumel(TorchBaseTest): @pytest.mark.parametrize( @@ -5761,6 +5828,29 @@ def forward(self, input_data): inputs, TestModel(), backend=backend, compute_unit=compute_unit ) + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_to_float16(self, compute_unit, backend): + class TestModel(torch.nn.Module): + def forward(self, input_data): + input_data = input_data.to(torch.float16) + return input_data + 8 + + inputs = [TensorType(name="input_data", shape=(1, 2, 3), dtype=np.float32)] + self.run_compare_torch( + inputs, + TestModel(), + backend=backend, + compute_unit=compute_unit, + atol=0.01, + rtol=0.001, + ) + @pytest.mark.parametrize( "compute_unit, backend, input_type", itertools.product( @@ -7243,14 +7333,10 @@ def forward(self, x): ) @pytest.mark.parametrize( - "compute_unit, backend, dynamic", - itertools.product( - compute_units, - backends, - [True, False], - ), + "compute_unit, backend, dynamic, mixed_rank", + itertools.product(compute_units, backends, [True, False], [True, False]), ) - def test_tensor_assign_case_8(self, compute_unit, backend, dynamic): + def test_tensor_assign_case_8(self, compute_unit, backend, dynamic, mixed_rank): # general case with dynamic begin and end class TensorAssignModel(torch.nn.Module): def forward(self, x, begin_0, begin_1, end_1): @@ -7260,6 +7346,22 @@ def forward(self, x, begin_0, begin_1, end_1): shape = (2, 10, 3) model = TensorAssignModel() + + if mixed_rank: + inputs = [ + torch.rand(*shape), + torch.as_tensor([[[1]]], dtype=torch.int32), + torch.as_tensor([1], dtype=torch.int32), + torch.as_tensor([[2]], dtype=torch.int32), + ] + else: + inputs = [ + torch.rand(*shape), + torch.as_tensor([1], dtype=torch.int32), + torch.as_tensor([1], dtype=torch.int32), + torch.as_tensor([2], dtype=torch.int32), + ] + if dynamic: upper_bound = 10 if backend[0] == "mlprogram" else -1 converter_input_type = [ @@ -7270,24 +7372,17 @@ def forward(self, x, begin_0, begin_1, end_1): ct.RangeDim(upper_bound=upper_bound), ) 
), - ct.TensorType(shape=(1,), dtype=np.int32), - ct.TensorType(shape=(1,), dtype=np.int32), - ct.TensorType(shape=(1,), dtype=np.int32), + ct.TensorType(shape=inputs[1].shape, dtype=np.int32), + ct.TensorType(shape=inputs[2].shape, dtype=np.int32), + ct.TensorType(shape=inputs[3].shape, dtype=np.int32), ] else: converter_input_type = None - inputs = [ - torch.rand(*shape), - torch.as_tensor([1], dtype=torch.int32), - torch.as_tensor([1], dtype=torch.int32), - torch.as_tensor([2], dtype=torch.int32), - ] - torch_inputs = [torch.clone(x) for x in inputs] expected_results = model(*torch_inputs) - self.run_compare_torch( + res = self.run_compare_torch( inputs, model, expected_results=expected_results, @@ -7297,6 +7392,13 @@ def forward(self, x, begin_0, begin_1, end_1): compute_unit=compute_unit ) + if not mixed_rank: + # the fuse_squeeze_expand_dims graph pass is going to + # fuse the pattern of ``squeeze -> expand_dims`` + prog = res[1]._mil_program + assert "squeeze" not in get_op_types_in_program(prog) + assert "expand_dims" not in get_op_types_in_program(prog) + @pytest.mark.parametrize( "compute_unit, backend", itertools.product( @@ -9323,47 +9425,52 @@ def forward(self, x): compute_unit=compute_unit ) -class TestSpectrogram(TorchBaseTest): - @pytest.mark.parametrize( - "compute_unit, backend, input_shape, spec, power", - itertools.product( - compute_units, - backends, - [(1, 1000), (1000,), (3, 1000)], # input shape - [torchaudio.transforms.Spectrogram, torchaudio.transforms.MelSpectrogram], - [None, 1, 2] # magnitude or power + +if _HAS_TORCH_AUDIO: + + class TestSpectrogram(TorchBaseTest): + @pytest.mark.parametrize( + "compute_unit, backend, input_shape, spec, power", + itertools.product( + compute_units, + backends, + [(1, 1000), (1000,), (3, 1000)], # input shape + [torchaudio.transforms.Spectrogram, torchaudio.transforms.MelSpectrogram], + [None, 1, 2], # magnitude or power + ), ) - ) - def test_spectrogram(self, compute_unit, backend, input_shape, spec, power): - if platform.machine() != "arm64": - pytest.xfail("rdar://108001659 ([PyTorch] Torchaudio Spectrogram Failed on Intel Machine)") + def test_spectrogram(self, compute_unit, backend, input_shape, spec, power): + if platform.machine() != "arm64": + pytest.xfail( + "rdar://108001659 ([PyTorch] Torchaudio Spectrogram Failed on Intel Machine)" + ) - if spec is torchaudio.transforms.MelSpectrogram and power is None: - pytest.skip("power or magnitude required for melspec") + if spec is torchaudio.transforms.MelSpectrogram and power is None: + pytest.skip("power or magnitude required for melspec") - class SpectrogramModel(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - # the other spectrogram options are passed through to stft - # and are tested in TestSTFT - self.spec = spec(power=power, n_fft=128) + class SpectrogramModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + # the other spectrogram options are passed through to stft + # and are tested in TestSTFT + self.spec = spec(power=power, n_fft=128) - def forward(self, x): - x = self.spec(x) - if power is None: - # complex: stack them - x = torch.stack([torch.real(x), torch.imag(x)], dim=0) - return x + def forward(self, x): + x = self.spec(x) + if power is None: + # complex: stack them + x = torch.stack([torch.real(x), torch.imag(x)], dim=0) + return x - np.random.seed(1024) - TorchBaseTest.run_compare_torch( - input_shape, - SpectrogramModel(), - backend=backend, - compute_unit=compute_unit, - rtol=1e-4, - atol=1e-4, - ) + 
np.random.seed(1024) + TorchBaseTest.run_compare_torch( + input_shape, + SpectrogramModel(), + backend=backend, + compute_unit=compute_unit, + rtol=1e-4, + atol=1e-4, + ) class TestNms(TorchBaseTest): @pytest.mark.parametrize( @@ -9859,7 +9966,9 @@ class TestScaledDotProductAttention(TorchBaseTest): [2, 3, 4, 5], ), ) - def test_different_input_ranks_no_mask(self, compute_unit, backend, rank): + def test_different_input_ranks_no_mask( + self, compute_unit, backend, rank, minimum_deployment_target=None + ): """ The query/key/value inputs can be any rank 2 or greater. """ @@ -9884,12 +9993,14 @@ def test_different_input_ranks_no_mask(self, compute_unit, backend, rank): }, ) - self.run_compare_torch( + res = self.run_compare_torch( [input_shape] * 3, model, backend=backend, compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, ) + return res[1] @pytest.mark.parametrize( "compute_unit, backend, seq_lengths, include_heads", @@ -9900,7 +10011,9 @@ def test_different_input_ranks_no_mask(self, compute_unit, backend, rank): [False, True], ), ) - def test_is_causal_flag(self, compute_unit, backend, seq_lengths, include_heads): + def test_is_causal_flag( + self, compute_unit, backend, seq_lengths, include_heads, minimum_deployment_target=None + ): source_seq_len, target_seq_len = seq_lengths query_shape = (2, 2, target_seq_len, 7) if include_heads else (2, target_seq_len, 7) key_shape = (2, 2, source_seq_len, 7) if include_heads else (2, source_seq_len, 7) @@ -9918,6 +10031,7 @@ def test_is_causal_flag(self, compute_unit, backend, seq_lengths, include_heads) model, backend=backend, compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, ) # check that "fill" and "band_part" ops, which are needed to compute mask, have been constant folded mil_prog = res[1]._get_mil_internal() @@ -9934,7 +10048,9 @@ def test_is_causal_flag(self, compute_unit, backend, seq_lengths, include_heads) [False, True], ), ) - def test_attn_mask(self, compute_unit, backend, seq_lengths, bool_mask): + def test_attn_mask( + self, compute_unit, backend, seq_lengths, bool_mask, minimum_deployment_target=None + ): if bool_mask: pytest.xfail( "rdar://110499660 ([CI][Bug] test_attn_mask is occasionally failing when bool_mask = True)" @@ -9960,6 +10076,7 @@ def test_attn_mask(self, compute_unit, backend, seq_lengths, bool_mask): model, backend=backend, compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, input_as_shape=False, ) @@ -9971,7 +10088,9 @@ def test_attn_mask(self, compute_unit, backend, seq_lengths, bool_mask): [True, False], ), ) - def test_toy_xformer_with_sdpa(self, compute_unit, backend, mask_as_input): + def test_toy_xformer_with_sdpa( + self, compute_unit, backend, mask_as_input, minimum_deployment_target=None + ): embedding_size = 32 seq_length = 16 n_heads = 4 @@ -10061,8 +10180,44 @@ def forward(self, x, mask=None): model, backend=backend, compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) + + def test_dropout_early_error_out(self): + B, S, L, E, EV = 3, 5, 7, 16, 32 + + query_shape = (B, L, E) + key_shape = (B, S, E) + value_shape = (B, S, EV) + + query = generate_input_data(query_shape) + key = generate_input_data(key_shape) + value = generate_input_data(value_shape) + + model = ModuleWrapper( + function=nn.functional.scaled_dot_product_attention, + kwargs={"dropout_p": 0.0} + ) + self.run_compare_torch( + (query, key, value), + model, + input_as_shape=False, ) + with pytest.raises( + ValueError, + 
match=r"scaled_dot_product_attention op: dropout is not supported yet", + ): + model = ModuleWrapper( + function=nn.functional.scaled_dot_product_attention, + kwargs={"dropout_p": 0.1} + ) + self.run_compare_torch( + (query, key, value), + model, + input_as_shape=False, + ) + class TestTransformer(TorchBaseTest): @pytest.mark.parametrize( @@ -10102,3 +10257,62 @@ def forward(self, x): return torch.fliplr(x) self.run_compare_torch(input_shape, TestModel(), backend=backend, compute_unit=compute_unit) + + +class TestMultinomial(TorchBaseTest): + @pytest.mark.parametrize( + "compute_unit, backend, num_samples", + itertools.product(compute_units, backends, [1, 3]), + ) + def test_multinomial(self, compute_unit, backend, num_samples): + class TestModel(nn.Module): + def forward(self, x): + return torch.multinomial(x, num_samples, replacement=True) + + # As sampling is random, we make one element significantly larger than others to make + # outputs consistent. + input_data = torch.tensor([0, 1e5, 0, 0, 1, 1, 1], dtype=torch.float) + self.run_compare_torch( + input_data, + TestModel(), + backend=backend, + compute_unit=compute_unit, + input_as_shape=False, + ) + + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_multinomial_not_supported(self, compute_unit, backend): + class TestModel(nn.Module): + def forward(self, x): + return torch.multinomial(x, 4) + + class TestModelDynamicNumSamples(nn.Module): + def forward(self, x): + return torch.multinomial(x, x.shape[0], replacement=True) + + input_data = torch.tensor([0, 10, 0, 0, 1, 1, 1], dtype=torch.float) + with pytest.raises( + ValueError, + match="When num_samples is larger than 1, only replacement=True is supported.", + ): + self.run_compare_torch( + input_data, + TestModel(), + backend=backend, + compute_unit=compute_unit, + input_as_shape=False, + ) + + with pytest.raises(ValueError, match="In torch.multinomial op, num_samples must be const"): + converter_input_type = [TensorType(shape=(RangeDim(1, 10),), dtype=np.float32)] + self.run_compare_torch( + input_data, + TestModelDynamicNumSamples(), + backend=backend, + compute_unit=compute_unit, + input_as_shape=False, + converter_input_type=converter_input_type, + ) diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_quantization_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_quantization_ops.py index 5a09606ff..7e96a4f38 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_quantization_ops.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_quantization_ops.py @@ -4,6 +4,7 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause import itertools +from typing import Optional import numpy as np import pytest @@ -29,24 +30,57 @@ torch.backends.quantized.engine = "qnnpack" -def _force_quantize_model(model, q_dtype): +def _force_quantize_model( + model: torch.nn.Module, + q_dtype: torch.dtype, + low: Optional[int] = None, + high: Optional[int] = None, + scale: Optional[float] = None, + zero_point: Optional[int] = None, + channel_axis: Optional[int] = None, +): """ In torch, the quantized model can only be obtained from PTQ. This utility allows us to produce an int8 quantized model. + + If channel_axis is set, it will do per-channel quantization instead of per-tensor, for the param + that channel_axis is valid for. 
""" - # modify the parameter to int8 + if scale is None: + scale = 1.0 + if zero_point is None: + zero_point = 0 + + # modify the parameter to force the quantization within a specific range. with torch.no_grad(): for name, param in model.named_parameters(): shape = param.shape - new_value = torch.quantize_per_tensor( - torch.rand(*shape), scale=1.0, zero_point=0, dtype=q_dtype + input_data = ( + torch.rand(*shape) if low is None else torch.randint(low, high, shape).float() ) + input_data = (input_data - zero_point) * scale + + if channel_axis is not None and -len(shape) <= channel_axis < len(shape): + scale = torch.Tensor([scale] * shape[channel_axis]) + zero_point = torch.Tensor([zero_point] * shape[channel_axis]) + new_value = torch.quantize_per_channel( + input_data, + scales=scale, + zero_points=zero_point, + axis=channel_axis, + dtype=q_dtype, + ) + else: + new_value = torch.quantize_per_tensor( + input_data, scale=scale, zero_point=zero_point, dtype=q_dtype + ) + param_cls = type(param) - kwargs = param.__dict__ new_value = param_cls(new_value, requires_grad=False).to(torch.device("cpu")) model._parameters[name] = new_value return model + class TorchQuantizationBaseTest(TorchBaseTest): @staticmethod def run_compare_torch( @@ -55,6 +89,7 @@ def run_compare_torch( atol=1e-04, rtol=1e-05, input_as_shape=True, + minimum_deployment_target=ct.target.iOS17, ): # TODO(rdar://108472419): properly design a random input if input_as_shape: @@ -69,7 +104,7 @@ def run_compare_torch( backend=("mlprogram", "fp32"), use_scripting=False, compute_unit=ct.ComputeUnit.CPU_ONLY, - minimum_deployment_target=ct.target.iOS17, + minimum_deployment_target=minimum_deployment_target, ) @@ -346,10 +381,10 @@ def forward(self, x): self.run_compare_torch(input_shape, model) @pytest.mark.parametrize( - "quant_dtype", - [torch.quint8, torch.qint8], + "quant_dtype, channel_axis", + itertools.product([torch.quint8, torch.qint8], [0, 1, None]), ) - def test_quantized_params(self, quant_dtype): + def test_quantized_params(self, quant_dtype, channel_axis): class Model(torch.nn.Module): def __init__(self): super().__init__() @@ -360,7 +395,7 @@ def forward(self, x): return torch.matmul(x, dequanitized_weight) model = Model() - model = _force_quantize_model(model, q_dtype=quant_dtype) + model = _force_quantize_model(model, q_dtype=quant_dtype, channel_axis=channel_axis) input_shape = [(3, 5)] res = self.run_compare_torch(input_shape, model) prog = res[1]._mil_program diff --git a/coremltools/converters/mil/frontend/torch/test/testing_utils.py b/coremltools/converters/mil/frontend/torch/test/testing_utils.py index 19cc9ffab..e7ca8d86b 100644 --- a/coremltools/converters/mil/frontend/torch/test/testing_utils.py +++ b/coremltools/converters/mil/frontend/torch/test/testing_utils.py @@ -3,6 +3,8 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +from enum import Enum + import numpy as np import pytest import torch @@ -11,11 +13,27 @@ import coremltools as ct import coremltools.models.utils as coremltoolsutils from coremltools import RangeDim, TensorType -from coremltools._deps import _IS_MACOS +from coremltools._deps import _HAS_EXECUTORCH, _HAS_TORCH_EXPORT_API, _IS_MACOS from coremltools.converters.mil.mil.types.type_mapping import nptype_from_builtin from coremltools.converters.mil.testing_utils import ct_convert, validate_minimum_deployment_target -from ..converter import torch_to_mil_types +from 
..torchscript_utils import torch_to_mil_types + +if _HAS_TORCH_EXPORT_API: + from torch.export import ExportedProgram + +if _HAS_EXECUTORCH: + from executorch import exir + + _CAPTURE_CONFIG = exir.CaptureConfig(enable_aot=True) + _EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( + _check_ir_validity=False, + ) + + +class TorchFrontend(Enum): + TORCHSCRIPT = 1 + EDGEIR = 2 class ModuleWrapper(nn.Module): @@ -62,7 +80,8 @@ def convert_to_coreml_inputs(input_description, inputs): """ flattened_inputs = _flatten(inputs) coreml_inputs = { - str(x): inp.numpy().astype(np.float32) for x, inp in zip(input_description, flattened_inputs) + str(x): inp.cpu().numpy().astype(np.float32) + for x, inp in zip(input_description, flattened_inputs) } for k, v in coreml_inputs.items(): @@ -94,12 +113,16 @@ def _convert_to_inputtype(inputs): else: inputs = converter_input_type + if _HAS_EXECUTORCH and isinstance(model_spec, ExportedProgram): + inputs = None + outputs = None + return ct_convert(model_spec, inputs=inputs, convert_to=backend, source="pytorch", compute_units=compute_unit, minimum_deployment_target=minimum_deployment_target) -def generate_input_data(input_size, rand_range=(0, 1)): +def generate_input_data(input_size, rand_range=(0, 1), torch_device=torch.device("cpu")): r1, r2 = rand_range def random_data(spec): @@ -115,7 +138,7 @@ def random_data(spec): data = np.random.rand(*static_shape) if static_shape != () else np.random.rand() data = (r1 - r2) * data + r2 - return torch.from_numpy(np.array(data).astype(dtype)) + return torch.from_numpy(np.array(data).astype(dtype)).to(torch_device) if isinstance(input_size, list): return [random_data(size) for size in input_size] @@ -135,7 +158,7 @@ def flatten_and_detach_torch_results(torch_results): if isinstance(torch_results, (list, tuple)): return [x.detach().numpy() for x in _flatten(torch_results) if x is not None] # Do not need to flatten - return [torch_results.detach().numpy()] + return [torch_results.detach().cpu().numpy()] def convert_and_compare( @@ -220,6 +243,8 @@ def run_compare_torch( converter_input_type=None, compute_unit=ct.ComputeUnit.CPU_ONLY, minimum_deployment_target=None, + torch_device=torch.device("cpu"), + frontend=TorchFrontend.TORCHSCRIPT, ): """ Traces a model and runs a numerical test. @@ -228,18 +253,35 @@ def run_compare_torch( expected_results : Expected result from running pytorch model. converter_input_type: If not None, then pass it to the "inputs" argument to the ct.convert() call. 
+ frontend: Either TorchFrontend.TORCHSCRIPT or TorchFrontend.EDGEIR """ if minimum_deployment_target is not None: validate_minimum_deployment_target(minimum_deployment_target, backend) model.eval() if input_as_shape: - input_data = generate_input_data(input_data, rand_range) - - if use_scripting: - model_spec = torch.jit.script(model) + input_data = generate_input_data(input_data, rand_range, torch_device) + + if frontend == TorchFrontend.TORCHSCRIPT: + if use_scripting: + model_spec = torch.jit.script(model) + else: + model_spec = trace_model(model, _copy_input_data(input_data)) + elif frontend == TorchFrontend.EDGEIR: + input_data_clone = _copy_input_data(input_data) + if isinstance(input_data_clone, list): + input_data_clone = tuple(input_data_clone) + elif isinstance(input_data_clone, torch.Tensor): + input_data_clone = (input_data_clone,) + model_spec = ( + exir.capture(model, input_data_clone, _CAPTURE_CONFIG) + .to_edge(_EDGE_COMPILE_CONFIG) + .exported_program + ) else: - model_spec = trace_model(model, _copy_input_data(input_data)) + raise ValueError( + f"Unknown value of frontend. Needs to be either TorchFrontend.TORCHSCRIPT or TorchFrontend.EDGEIR. Provided: {frontend}" + ) model_spec, mlmodel, coreml_inputs, coreml_results = convert_and_compare( input_data, diff --git a/coremltools/converters/mil/frontend/torch/torch_op_registry.py b/coremltools/converters/mil/frontend/torch/torch_op_registry.py index 128fdd5ae..3efb284f5 100644 --- a/coremltools/converters/mil/frontend/torch/torch_op_registry.py +++ b/coremltools/converters/mil/frontend/torch/torch_op_registry.py @@ -3,7 +3,98 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -_TORCH_OPS_REGISTRY = {} +from typing import Callable + +from coremltools.models._deprecation import deprecated as _deprecated + + +class TorchOpsRegistry: + def __init__(self): + self.name_to_func_mapping = {} + + def get_func(self, op_lookup: str) -> Callable: + """ + Given a op type key, return the according translation function. + Note that the key is sanitized by removing suffix and prefix ``_`` before query. + For instance, ``__add__`` -> ``add``, ``sub_`` -> ``sub``. + """ + if op_lookup.startswith("__") and op_lookup.endswith("__"): + # Some ops may have double underscore, such as `__and__`. + op_lookup = op_lookup[2:-2] + elif op_lookup.endswith("_"): + # This is an "in place" op. + # Look up the standard op instead by removing underscore. + op_lookup = op_lookup[:-1] + + return self.name_to_func_mapping.get(op_lookup, None) + + def register_func(self, func=None, torch_alias=None, override=False): + """ + Given an op name and its alias, put the translation function (callable) + into the registry. + """ + f_name = func.__name__ + all_f_names = [f_name] + if torch_alias is not None: + all_f_names.extend(torch_alias) + + for name in all_f_names: + if name.endswith("_"): + raise Exception( + f'Attempting to register "{name}" op. Do not register inplace ops. (inplace torch ops' + f' end in a "_"). Instead register the normal op version: "{name[:-1]}". The inplace' + f" version will be supported automatically." 
+ ) + if not override and name in self.name_to_func_mapping: + raise ValueError(f"Torch op {name} already registered.") + self.set_func_by_name(func, name) + + def set_func_by_name(self, func, name): + self.name_to_func_mapping[name] = func + + def is_inplace_op(self, op_lookup: str): + """ + A torch op is considered inplace if the op name endswith ``_``. + """ + return not (op_lookup.startswith("__") and op_lookup.endswith("__")) and op_lookup.endswith( + "_" + ) + + # The following functions will be deprecated after 7.2 + # rdar://117502178 ([Infra][Pytorch] We should deprecate the direct use of _TORCH_OPS_REGISTRY in 7.2) + @_deprecated( + suffix="Please use coremltools.converters.mil.frontend.torch.register_torch_op", + version="7.2", + obj_prefix="_TORCH_OPS_REGISTRY.", + ) + def __contains__(self, key: str) -> bool: + return key in self.name_to_func_mapping + + @_deprecated( + suffix="Please use coremltools.converters.mil.frontend.torch.register_torch_op", + version="7.2", + obj_prefix="_TORCH_OPS_REGISTRY.", + ) + def __setitem__(self, key: str, value: Callable) -> None: + self.name_to_func_mapping[key] = value + + @_deprecated( + suffix="Please use coremltools.converters.mil.frontend.torch.register_torch_op", + version="7.2", + obj_prefix="_TORCH_OPS_REGISTRY.", + ) + def __delitem__(self, key: str) -> None: + del self.name_to_func_mapping[key] + + @_deprecated( + suffix="Please use coremltools.converters.mil.frontend.torch.register_torch_op", + version="7.2", + obj_prefix="_TORCH_OPS_REGISTRY.", + ) + def __getitem__(self, key: str) -> Callable: + return self.name_to_func_mapping[key] + +_TORCH_OPS_REGISTRY = TorchOpsRegistry() def register_torch_op(_func=None, torch_alias=None, override=False): @@ -28,28 +119,8 @@ def register_torch_op(_func=None, torch_alias=None, override=False): function. Otherwise, duplicate registration will error out. """ - def func_wrapper(func): - f_name = func.__name__ - - if f_name.endswith("_"): - raise Exception( - "Attempting to register \"{}\" op. Do not register inplace ops. (inplace torch ops" - " end in a \"_\"). Instead register the normal op version: \"{}\". The inplace" - " version will be supported automatically.".format(f_name, f_name[:-1]) - ) - if not override and f_name in _TORCH_OPS_REGISTRY: - raise ValueError("Torch op {} already registered.".format(f_name)) - - _TORCH_OPS_REGISTRY[f_name] = func - - if torch_alias is not None: - for name in torch_alias: - if not override and name in _TORCH_OPS_REGISTRY: - msg = "Torch op alias {} already registered." - raise ValueError(msg.format(name)) - _TORCH_OPS_REGISTRY[name] = func - + _TORCH_OPS_REGISTRY.register_func(func, torch_alias, override) return func if _func is None: diff --git a/coremltools/converters/mil/frontend/torch/torchir_passes.py b/coremltools/converters/mil/frontend/torch/torchir_passes.py index d066d9a9d..b234c5328 100644 --- a/coremltools/converters/mil/frontend/torch/torchir_passes.py +++ b/coremltools/converters/mil/frontend/torch/torchir_passes.py @@ -29,11 +29,11 @@ def forward(self, x): # x a tensor with shape [4,10] %3 = copy_(%2, value=[[1], [3]]) output -> %x - This graph pass fuses the sequences into a single InternalTorchIRNode of a new kind, which is defined as `_internal_op_tensor_inplace_copy`. + This graph pass fuses the sequences into a single InternalTorchIRNode of a new kind, which is defined as `_internal_op_tensor_inplace_copy_`. 
input -> %x %nodes_to_fuse = [slice(%x, begin=0, end=2, stride=1), select(%1, dim=1, index=4)] - %x_internal_tensor_assign_1 = _internal_op_tensor_inplace_copy(%x, value=[[1],[3]], nodes_to_fuse=nodes_to_fuse) + %x_internal_tensor_assign_1 = _internal_op_tensor_inplace_copy_(%x, value=[[1],[3]], nodes_to_fuse=nodes_to_fuse) output -> x_internal_tensor_assign_1 The _internal_tensor_value_assign op takes an additional internal data member nodes_to_fuse, @@ -58,12 +58,12 @@ def forward(self, x): # x a tensor with shape [4,10] Output graph: input -> %x %nodes_to_fuse_1 = [select(%x, dim=0, index=0), select(%1, dim=0, index=0)] - %x_internal_tensor_assign_1 = _internal_op_tensor_inplace_copy(%x, value=1, nodes_to_fuse=nodes_to_fuse_1) + %x_internal_tensor_assign_1 = _internal_op_tensor_inplace_copy_(%x, value=1, nodes_to_fuse=nodes_to_fuse_1) %nodes_to_fuse_2 = [slice(%x, dim=0, begin=1, end=2, stride=1), slice(%4, dim=1, begin=1, end=2, stride=1)] - %x_internal_tensor_assign_2 = _internal_op_tensor_inplace_copy(%x_internal_tensor_assign_1, value=[[0]], nodes_to_fuse=nodes_to_fuse_2) + %x_internal_tensor_assign_2 = _internal_op_tensor_inplace_copy_(%x_internal_tensor_assign_1, value=[[0]], nodes_to_fuse=nodes_to_fuse_2) output -> x_internal_tensor_assign_2 - torch.Tensor.fill_ works in a similar way, except the InternalTorchIRNodes is defined by `_internal_op_tensor_inplace_fill`. + torch.Tensor.fill_ works in a similar way, except the InternalTorchIRNodes is defined by `_internal_op_tensor_inplace_fill_`. A fill_ operator is generated from the following forward pass: @@ -90,10 +90,10 @@ def forward(self, x): # x a tensor with shape [4,10] input -> %x %y = [empty[](x.shape)] - %x_internal_tensor_assign_1 = _internal_op_tensor_inplace_copy(%y, %x) + %x_internal_tensor_assign_1 = _internal_op_tensor_inplace_copy_(%y, %x) output -> %x_internal_tensor_assign_1 - As a result of side effects of fusing, output of `_internal_op_tensor_inplace_copy` will be renamed to `x_internal_tensor_assign_1`. + As a result of side effects of fusing, output of `_internal_op_tensor_inplace_copy_` will be renamed to `x_internal_tensor_assign_1`. If `%1` should be renamed to `x_internal_tensor_assign_1` too, the graph will be invalid. In this purpose out_alias was introduced. """ @@ -151,9 +151,9 @@ def _construct_nodes_to_fuse_inputs(nodes_to_fuse): raise ValueError("No matching select or slice.") if node.kind == "copy_": - kind = "_internal_op_tensor_inplace_copy" + kind = "_internal_op_tensor_inplace_copy_" else: - kind = "_internal_op_tensor_inplace_fill" + kind = "_internal_op_tensor_inplace_fill_" nodes_to_fuse = tensor_to_node_sequence_mapping[node_input] if nodes_to_fuse[0].kind in ["select", "slice"]: @@ -169,7 +169,7 @@ def _construct_nodes_to_fuse_inputs(nodes_to_fuse): update_value = node.inputs[1] nodes_to_fuse_inputs = _construct_nodes_to_fuse_inputs(nodes_to_fuse) tensor_assign_node = InternalTorchIRNode( - node=None, + name=outputs[0], inputs=[source_tensor, update_value] + nodes_to_fuse_inputs, outputs=outputs, kind=kind, @@ -273,8 +273,8 @@ def transform_inplace_ops(graph, name_remap_dict=None): def flatten_graph_input_values(graph): - """ CoreML can't handle nested iterables of tensors, so we flatten the - inputs of any graph that expects them. + """CoreML can't handle nested iterables of tensors, so we flatten the + inputs of any graph that expects them. 
""" new_graph_inputs = graph.inputs all_new_nodes = [] @@ -306,6 +306,7 @@ def flatten_graph_input_values(graph): inputs=node_inputs, outputs=[_input_name], kind="tupleconstruct", + name=_input_name, ) ) else: diff --git a/coremltools/converters/mil/frontend/torch/torchscript_utils.py b/coremltools/converters/mil/frontend/torch/torchscript_utils.py new file mode 100644 index 000000000..5f79e9355 --- /dev/null +++ b/coremltools/converters/mil/frontend/torch/torchscript_utils.py @@ -0,0 +1,201 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import torch + +from coremltools._deps import version_lt +from coremltools.converters.mil.mil import types + +torch_to_mil_types = { + torch.bool: types.bool, + torch.float16: types.fp16, + torch.float32: types.fp32, + torch.float64: types.fp32, + torch.int32: types.int32, + torch.int64: types.int32, +} + +mil_to_torch_types = {v: k for k, v in torch_to_mil_types.items()} + +def _jit_pass_lower_graph(graph, torchscript): + """ + This graph pass does a similar thing as torch._C._jit_pass_lower_graph does. + It does three things: + 1. Rename getattr nodes which produce a torch tensor to match the keys in torch model's state_dict + 2. Construct the params_dict, with the keys similar to state_dict + 3. Get the named_buffer dict in torch model + + To be more specific, this graph pass traces down series of GetAttr ops, and rename the final node to match the torch model state_dict. + It also replaces the node inputs by the first created tensor node with the same name. + + Example: + Input graph: + graph(%self.1 : __torch__.torch.nn.modules.Sequential, %input.1 : Tensor): + %2 : prim::GetAttr[name="linear"](%self.1) + %3 : prim::GetAttr[name="weight"](%2) + %4 : prim::GetAttr[name="bias"](%2) + %5 : prim::GetAttr[name="bias"](%2) # duplicated node + %6 : conv(%input.1, %3, %4) + %7 : add(%input.1, %5) + return (%6, %7) + + Output graph: + graph(%self.1 : __torch__.torch.nn.modules.Sequential, %input.1 : Tensor): + %2 : prim::GetAttr[name="linear"](%self.1) + %linear.weight : prim::GetAttr[name="weight"](%2) + %linear.bias : prim::GetAttr[name="bias"](%2) + %5 : prim::GetAttr[name="bias"](%2) # duplicated node, it is not used now + %6 : conv(%input.1, %linear.weight, %linear.bias) + %7 : add(%input.1, %linear.bias) # the second input is replaced + return (%6, %7) + + And a dictionary {"linear.weight": ..., "linear.bias": ...} is returned, to record the parameters values. + Note that, those GetAttr nodes are still in the torch ir graph, but they would be removed in a latter + graph pass in the coremltools torch internal graph + + """ + + """ + Each getattr node corresponds to a torch object in the torch IR, + it could be either: + 1. torch.nn.modules: submodule in a torch model. For instance, a linear layer in a MLP network. + 2. torch.Tensor: torch model parameters. For instance, weight for a conv layer. + 3. torch._C.ScriptObject: quantized torch model parameters. + For example, in the graph above, %2 is pointing to the __torch__.torch.nn.modules.Sequential.linear torch submodule. + node_to_module_map tracks these mapping. + + node_to_prefic_map track the name for each module, + for example, %2 has the prefix name linear and %3 is linear.weight. 
+
+    These names are also keys in the state_dict
+    """
+    node_to_module_map = {}
+    node_to_prefix_map = {}
+    first_node_with_prefix = {}
+    replace_input = {}
+
+    base_module_node = list(graph.inputs())[0]
+    node_to_module_map[base_module_node] = torchscript
+    node_to_prefix_map[base_module_node] = ""
+
+    """
+    params_dict will be constructed in this graph pass. It contains all const tensors needed for the graph computation.
+    And the value is validated against the state_dict if the key is present in both dictionaries.
+    In some rare cases, state_dict lacks parameters / buffers, so we still need to go through the whole graph ourselves.
+    """
+    params_dict = {}
+    state_dict = torchscript.state_dict(keep_vars=True)
+    buffer_dict = {k: v for k, v in torchscript.named_buffers()}
+
+    def _check_is_tensor(node, module):
+        if not isinstance(module, torch.Tensor):
+            return False
+        if str(node.output().type()) not in ("Tensor", "Optional[Tensor]"):
+            raise TypeError(f'Type "{node.output().type()}" not supported')
+        return True
+
+    def _check_is_quantized_tensor(node, module):
+        if not isinstance(module, torch._C.ScriptObject):
+            return False
+        # We only support ScriptObjects that correspond to quantized packed params.
+        assert "PackedParams" in node.output().type().name()
+        return True
+
+    def _lower_graph_block(graph):
+        for node in list(graph.nodes()):
+
+            for block in node.blocks():
+                _lower_graph_block(block)
+
+            for idx, _input in enumerate(list(node.inputs())):
+                if _input in replace_input:
+                    node.replaceInput(idx, replace_input[_input])
+
+            kind = node.kind().split("::")[1].lower()
+            if kind != "getattr":
+                continue
+
+            _input = node.input()
+            _output = node.output()
+            attr_name = getattr(node, node.kindOf("name"))("name")
+
+            module = getattr(node_to_module_map[_input], attr_name)
+            node_to_module_map[_output] = module
+
+            input_prefix = node_to_prefix_map[_input]
+            prefix = input_prefix + '.' + attr_name if input_prefix != "" else attr_name
+            node_to_prefix_map[_output] = prefix
+
+            is_tensor = _check_is_tensor(node, module)
+            is_quantized_tensor = _check_is_quantized_tensor(node, module)
+
+            if is_tensor or is_quantized_tensor:
+                if is_tensor and prefix in state_dict:
+                    assert torch.equal(
+                        module.cpu(), state_dict[prefix].cpu()
+                    ), "tensor value not consistent between torch ir and state_dict"
+                if prefix in params_dict:
+                    assert torch.equal(module.cpu(), params_dict[prefix].cpu())
+                    replace_input[_output] = first_node_with_prefix[prefix]
+                else:
+                    params_dict[prefix] = module
+                    first_node_with_prefix[prefix] = _output
+                    _output.setDebugName(prefix)
+
+    _lower_graph_block(graph)
+    return graph, params_dict, buffer_dict
+
+def _expand_and_optimize_ir(torchscript):
+    """
+    Given a torch.jit.ScriptModule, convert it to an optimized
+    torch._C.Graph and a dict of model parameter names to tensors.
+    """
+    graph = torchscript.forward.graph
+
+    # From PyTorch code: Inline function and method calls.
+    torch._C._jit_pass_inline(graph)
+    # From PyTorch code: This inlines the forked section in the fork()
+    # callsite and replaces uses of the result of wait() calls with the
+    # values produced from the (now-inlined) forked section.
+    torch._C._jit_pass_inline_fork_wait(graph)
+    # Starting from the return node, marks all nodes that feed into the
+    # output, as well as nodes with side effects. Any nodes not marked are
+    # eliminated.
+    torch._C._jit_pass_dce(graph)
+    # From PyTorch code: checks well-formedness and invariants of graph.
+ torch._C._jit_pass_lint(graph) + # Replaces a couple specific ops patterns (add, sub, mul, div, chunk). + if version_lt(torch, "1.6.0"): + torch._C._jit_pass_canonicalize_ops(graph) + torch._C._jit_pass_lint(graph) + + # From PyTorch code: This pass catches all of the small, easy to catch + # peephole optimizations you might be interested in doing. + # Eliminate no-op 'expand' nodes + # Simplify x.t().t() to x + # pass disabled for v1.6.0 and onwards, wrongly captures the shape of dummy inputs during tracing. + torch._C._jit_pass_peephole(graph, addmm_fusion_enabled=False) + else: + # v1.6.0 pass renamed + torch._C._jit_pass_canonicalize_graph_fuser_ops(graph) + torch._C._jit_pass_lint(graph) + + # From PyTorch docs: Renumber the graph so that all structurally + # equivalent graphs have same numbers. + graph = torch._C._jit_pass_canonicalize(graph) + torch._C._jit_pass_lint(graph) + if version_lt(torch, "1.6.0"): + # v1.6.0 JIT changes disallows pulling list values out of + # prim::Constant. We can only pull scalar values. constant + # propagation removes `listConstruct` and results in list values. + # We disallow constant prop pass to keep them as scalars, and rely + # on our own constant prop to interpret `listConstruct`. + torch._C._jit_pass_constant_propagation(graph) + # NOTE: Don't need another DCE, it's included in constant propagation. + torch._C._jit_pass_lint(graph) + + # Get the params_dict and rename the getattr nodes in the graph + graph, params_dict, buffer_dict = _jit_pass_lower_graph(graph, torchscript) + + return graph, params_dict, buffer_dict diff --git a/coremltools/converters/mil/input_types.py b/coremltools/converters/mil/input_types.py index 04fa9030a..8c6cf8637 100644 --- a/coremltools/converters/mil/input_types.py +++ b/coremltools/converters/mil/input_types.py @@ -310,6 +310,7 @@ def __init__( self.symbol = get_new_symbol() else: from coremltools.converters.mil.mil import Symbol + self.symbol = Symbol(symbol) self.lower_bound = lower_bound self.upper_bound = upper_bound diff --git a/coremltools/converters/mil/mil/__init__.py b/coremltools/converters/mil/mil/__init__.py index 15f4c03b2..2ec248b9e 100644 --- a/coremltools/converters/mil/mil/__init__.py +++ b/coremltools/converters/mil/mil/__init__.py @@ -7,13 +7,30 @@ from .block import Block, Function, curr_block from .builder import Builder -from .input_type import (SUPPORT_FLOAT_TYPES, SUPPORT_INT_TYPES, DefaultInputs, - InputSpec, InternalVar, ListInputType, - PyFunctionInputType, TensorInputType, TupleInputType) +from .input_type import ( + SUPPORT_FLOAT_TYPES, + SUPPORT_INT_TYPES, + DefaultInputs, + InputSpec, + InternalVar, + ListInputType, + PyFunctionInputType, + TensorInputType, + TupleInputType, +) from .operation import Operation, mil_list, precondition -from .program import (InputType, Placeholder, Program, Symbol, - get_existing_symbol, get_new_symbol, - get_new_variadic_symbol) +from .program import ( + InputType, + Placeholder, + Program, + Symbol, + get_existing_symbol, + get_new_symbol, + get_new_variadic_symbol, +) from .var import ListVar, Var -from .ops.defs._op_reqs import register_op +""" +DO NOT REMOVE THIS COMMENT, since we need to keep the import order. 
+""" +from .ops.defs._op_reqs import register_op diff --git a/coremltools/converters/mil/mil/block.py b/coremltools/converters/mil/mil/block.py index 399e65628..4871e1b1f 100644 --- a/coremltools/converters/mil/mil/block.py +++ b/coremltools/converters/mil/mil/block.py @@ -5,12 +5,14 @@ import copy from collections import Counter, OrderedDict +from typing import Tuple -from coremltools import _OPSET, _logger as logger -from coremltools.converters.mil._deployment_compatibility import \ - AvailableTarget as _target +from coremltools import _OPSET +from coremltools import _logger as logger +from coremltools.converters.mil._deployment_compatibility import AvailableTarget as _target from . import SPACES, types +from .operation import Operation from .types.symbolic import is_symbolic, k_used_symbols from .var import ComplexVar, InternalVar, Var from .visitors.dot_visitor import DotVisitor @@ -895,3 +897,26 @@ def to_str(self, func_name="function"): s += self.indented_str(SPACES) s += "}\n" return s + + def get_max_opset_version_and_op(self) -> Tuple[_target, Operation]: + """ + Find the max opset version among all operations in the function. + Returns the opset version Enum and the corresponding op. + """ + max_opset_version = _target.iOS13 + op_with_max_opset_version = None + + def update_max_opset_version_block(block): + nonlocal max_opset_version + nonlocal op_with_max_opset_version + for op in list(block.operations): + for b in op.blocks: + update_max_opset_version_block(b) + if not hasattr(op, "_op_variants") or not isinstance(op._op_variants, dict): + continue + if op.opset_version > max_opset_version: + max_opset_version = op.opset_version + op_with_max_opset_version = op + + update_max_opset_version_block(self) + return max_opset_version, op_with_max_opset_version diff --git a/coremltools/converters/mil/mil/builder.py b/coremltools/converters/mil/mil/builder.py index 2f782c27f..68f1b2a27 100644 --- a/coremltools/converters/mil/mil/builder.py +++ b/coremltools/converters/mil/mil/builder.py @@ -5,15 +5,16 @@ import numbers from collections import defaultdict +from typing import Callable, List, Optional import numpy as np from coremltools import _logger as logger +from coremltools.converters.mil._deployment_compatibility import AvailableTarget from coremltools.converters.mil.mil.types.symbolic import any_symbolic from .block import Function, curr_block -from .input_type import (InternalInputType, ListOrTensorInputType, - TensorInputType, TupleInputType) +from .input_type import InternalInputType, ListOrTensorInputType, TensorInputType, TupleInputType from .program import Placeholder, Program from .var import InternalVar, Var @@ -163,6 +164,7 @@ def _add_op(cls, op_cls, **kwargs): input_spec=op_cls.input_spec, op_name=kwargs["name"], before_op=before_op, candidate_kv=kwargs)) + kwargs["enclosing_block"] = curr_block() new_op = op_cls(**kwargs) # Initialize optional input Vars if it wasn't in kwargs @@ -193,21 +195,92 @@ def TensorSpec(shape, dtype=None): return Placeholder(shape, dtype) @staticmethod - def program(input_specs=None, opset_version=None): + def _create_function( + main_block: Callable, + input_specs: Optional[List[Placeholder]] = None, + opset_version: Optional[AvailableTarget] = None, + ): """ + Utility to construct a pymil function. + """ + if input_specs is None: + input_specs = [] - The ``mb.program`` decorator creates a MIL program with a single - function (``main``). The input to ``main`` is a tensor. 
+ # validate number of function inputs + num_args = main_block.__code__.co_argcount + arg_names = list(main_block.__code__.co_varnames)[:num_args] + if len(input_specs) != num_args: + raise ValueError( + f"{main_block.__name__} expects {num_args} inputs: {arg_names}. Got {len(input_specs)} input_specs." + ) + + # create the function + input_spec_dict = {k: v for k, v in zip(arg_names, input_specs)} + with Function(input_spec_dict, opset_version) as func: + input_vars = [func.inputs[a] for a in arg_names] + outputs = main_block(*input_vars) + if isinstance(outputs, tuple): + outputs = list(outputs) + elif not isinstance(outputs, list): + outputs = [outputs] + func.set_outputs(outputs) + + # infer the opset version if not provided + max_opset_version, _ = func.get_max_opset_version_and_op() + if opset_version is None: + func.opset_version = max_opset_version + + return func + + @staticmethod + def function( + input_specs: Optional[List[Placeholder]] = None, + opset_version: Optional[AvailableTarget] = None, + ): + """ + The ``mb.function`` decorator creates a MIL function. Parameters ---------- + input_specs: List[TensorSpec] + Describes the function inputs + + opset_version: AvailableTarget enum + Describes the opset version of the function + + Examples + -------- + >>> import coremltools as ct + >>> @mb.function(input_specs=[mb.TensorSpec(shape=(1,2))], opset_version=ct.target.iOS16) + >>> def func(a): + >>> return mb.add(x=a, y=2) - input_specs: TensorSpec - Describes a tensor. + """ + def wrapper(main_block): + return Builder._create_function(main_block, input_specs, opset_version) + + return wrapper + + @staticmethod + def program( + input_specs: Optional[List[Placeholder]] = None, + opset_version: Optional[AvailableTarget] = None, + function_name: Optional[str] = "main", + ): + """ + The ``mb.program`` decorator creates a MIL program with a single + function with name ``function_name``. + + Parameters + ---------- + input_specs: List[TensorSpec] + Describes the function inputs opset_version: AvailableTarget enum Describes the opset version of the program + function_name: str + Name of the function Examples -------- @@ -217,30 +290,9 @@ def program(input_specs=None, opset_version=None): >>> return mb.add(x=a, y=2) """ - if input_specs is None: - input_specs = [] - def wrapper(main_block): + function = Builder._create_function(main_block, input_specs, opset_version) program = Program() - num_args = main_block.__code__.co_argcount - arg_names = list(main_block.__code__.co_varnames)[:num_args] - if len(input_specs) != num_args: - msg = "{} expects {} inputs: {}. Got {} input_specs." 
- raise ValueError( - msg.format( - main_block.__name__, num_args, arg_names, len(input_specs) - ) - ) - input_spec_dict = {k: v for k, v in zip(arg_names, input_specs)} - with Function(input_spec_dict, opset_version) as func: - input_vars = [func.inputs[a] for a in arg_names] - outputs = main_block(*input_vars) - if isinstance(outputs, tuple): - outputs = list(outputs) - elif not isinstance(outputs, list): - outputs = [outputs] - func.set_outputs(outputs) - program.add_function("main", func) + program.add_function(function_name, function) return program - return wrapper diff --git a/coremltools/converters/mil/mil/input_type.py b/coremltools/converters/mil/mil/input_type.py index f3b57e491..29cd32bd7 100644 --- a/coremltools/converters/mil/mil/input_type.py +++ b/coremltools/converters/mil/mil/input_type.py @@ -276,7 +276,7 @@ def __init__(self, type_domain, **kwargs): super().__init__(**kwargs) def _is_compatible(self, v): - result = types.is_scalar(v.dtype) or types.is_tensor(v.dtype) + result = types.is_scalar(v.sym_type) or types.is_tensor(v.sym_type) result = result and (v.dtype in self.type_domain) return result @@ -309,9 +309,6 @@ class ListInputType(_InputType): """ ListInputType allows inputs of type types.list """ - def __init__(self, **kwargs): - super().__init__(**kwargs) - def _is_compatible(self, v): return types.is_list(v.sym_type) @@ -326,14 +323,9 @@ class ListOrTensorInputType(_InputType): (1) MIL tensor (2) python list/tuple of MIL tensors """ - def __init__(self, **kwargs): - super().__init__(**kwargs) - def _is_compatible(self, v): return ( - types.is_list(v.sym_type) - or types.is_scalar(v.dtype) - or types.is_tensor(v.dtype) + types.is_list(v.sym_type) or types.is_scalar(v.sym_type) or types.is_tensor(v.sym_type) ) @property @@ -345,9 +337,6 @@ class TupleInputType(_InputType): """ TupleInputType specifies input types of python list/tuple of MIL tensors. """ - def __init__(self, **kwargs): - super().__init__(**kwargs) - def _is_compatible(self, v): # We don't check the detail types within the tuple. return isinstance(v, (tuple, list)) @@ -363,10 +352,6 @@ class InternalInputType(_InputType): It allows ops to take, for example, python primitive types, instead of only the builtin types. """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - def _is_compatible(self, v): return True # skip type check by default for InternalInputType. @@ -375,9 +360,5 @@ class PyFunctionInputType(InternalInputType): """ Native python function. """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - def _is_compatible(self, v): return callable(v.val) diff --git a/coremltools/converters/mil/mil/operation.py b/coremltools/converters/mil/mil/operation.py index 5fdb6add8..71263fd70 100644 --- a/coremltools/converters/mil/mil/operation.py +++ b/coremltools/converters/mil/mil/operation.py @@ -9,11 +9,9 @@ from coremltools.converters.mil.mil import types from coremltools.converters.mil.mil.types import is_compatible_type -from coremltools.converters.mil.mil.types.symbolic import (any_symbolic, - is_symbolic) +from coremltools.converters.mil.mil.types.symbolic import any_symbolic, is_symbolic from . import SPACES -from .block import curr_block from .input_type import DefaultInputs, TensorInputType, TupleInputType from .var import ComplexVar, InternalVar, ListVar, Var @@ -143,6 +141,11 @@ class Operation: input_types (InputSpec, class attr): Read-only named input types from all subclasses. Input types are used to validate `inputs`. 
+ If an input arg name start with prefix `_`, that indicates the input has the following properties: + 1. Most of the time, the input is type of ``InternalInputType`` and + used only in pymil scope. It doesn't have the corresponding arg / attr + in the MIL framework definition. + 2. It won't be printed in pymil. inputs [_input_vars] (dict of str --> Var): An Operation (subclass of Operation) only has access to input Var, @@ -163,7 +166,7 @@ def __init__(self, **kwargs): self._output_vars = None self._input_vars = {} self.blocks = [] - self.enclosing_block = curr_block() + self.enclosing_block = kwargs["enclosing_block"] # Initialize inputs as object attributes (all None) for k in self._input_types.keys(): @@ -205,6 +208,7 @@ def _check_expected_inputs(self, kwargs): "no_check_var_visibility", # no_check_var_visibility==True to deviate from SSA "no_check_var_types", # no_check_var_types==True to force set inputs, even if type does not match with earlier ones + "enclosing_block", ] for k in kwargs.keys(): if k not in non_attributes and k not in self._input_types: diff --git a/coremltools/converters/mil/mil/ops/defs/_utils.py b/coremltools/converters/mil/mil/ops/defs/_utils.py index fe2c1e074..3f084dc84 100644 --- a/coremltools/converters/mil/mil/ops/defs/_utils.py +++ b/coremltools/converters/mil/mil/ops/defs/_utils.py @@ -20,6 +20,37 @@ MAX_SIZE_CONSTANT_FOLDING = 1024 * 1024 / 4 # When a fp32 const takes over 1MB, we won't create a const op for that +class ConvPoolingTypeInferenceCache(dict): + """ + An utility class to cache the shape inference of ``conv`` and ``pool`` op. + The cache mechanism makes sure ops with the same input shape (symbolic also), + and params (``pad, stride, kernel``) would produce the same output shape. + """ + @staticmethod + def get_cache_key( + input_shape: Tuple[int], + pad_type: str, + pad: Tuple[int], + strides: Tuple[int], + kernel: Tuple[int], + ceil_mode: bool, + ) -> Tuple[Tuple]: + return ( + ("input_shape", input_shape), + ("pad_type", pad_type), + ("pad", pad), + ("strides", strides), + ("kernel", kernel), + ("ceil_mode", ceil_mode), + ) + + def __setitem__(self, key, value): + if key in self: + raise ValueError(f"cache key {key} already exisit.") + return dict.__setitem__(self, key, value) + +CONV_POOLING_TYPE_INFERENCE_CACHE = ConvPoolingTypeInferenceCache() + def broadcast_shapes(shape_x, shape_y): """ Check and broadcast given input shapes. @@ -129,7 +160,7 @@ def effective_kernel(kernel_shape, dilations): f"kernel_shape ({len(kernel_shape)}) and dilations ({len(dilations)}) " f"must be the same length" ) - return [(k - 1) * d + 1 for k, d in zip(kernel_shape, dilations)] + return tuple([(k - 1) * d + 1 for k, d in zip(kernel_shape, dilations)]) def aggregated_pad( @@ -161,7 +192,7 @@ def aggregated_pad( Returns: - A list of total (before + after) padding for each spatial dimension in kernel_shape. + A tuple of total (before + after) padding for each spatial dimension in kernel_shape. 
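# A worked example (values assumed) of the padding helpers above. With the
# signatures shown in this hunk, ``effective_kernel`` applies dilation as
# (k - 1) * d + 1 per spatial dimension, and for ``pad_type="same"`` the total
# padding per dimension is max(0, s * ceil(i / s) - i + k_eff - s).
from coremltools.converters.mil.mil.ops.defs._utils import (
    aggregated_pad,
    effective_kernel,
)

# kernel 3x3 dilated by 2 -> effective kernel 5x5
assert effective_kernel(kernel_shape=[3, 3], dilations=[2, 2]) == (5, 5)

# input spatial dims 10x10, stride 2: max(0, 2 * ceil(10 / 2) - 10 + 5 - 2) = 3
pad = aggregated_pad(
    pad_type="same",
    kernel_shape=[3, 3],
    input_shape=[10, 10],
    strides=[2, 2],
    dilations=[2, 2],
)
assert pad == (3, 3)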
""" num_spatial_dims = len(kernel_shape) if dilations is None: @@ -188,19 +219,20 @@ def aggregated_pad( ) ) effective_ks = effective_kernel(kernel_shape, dilations) - return [ - int(max(0, s * math.ceil(float(i) / float(s)) - i + k - s)) - if not is_symbolic(i) else get_new_symbol() - for i, k, s in zip(input_shape, effective_ks, strides) - ] + return tuple( + [ + int(max(0, s * math.ceil(float(i) / float(s)) - i + k - s)) + if not is_symbolic(i) + else get_new_symbol() + for i, k, s in zip(input_shape, effective_ks, strides) + ] + ) if pad_type == "valid": - return [0] * num_spatial_dims + return tuple([0] * num_spatial_dims) if pad_type == "custom": if custom_pad is None or len(custom_pad) != 2 * num_spatial_dims: raise ValueError("Invalid custom_pad.") - return [ - custom_pad[2 * d] + custom_pad[2 * d + 1] for d in range(num_spatial_dims) - ] + return tuple([custom_pad[2 * d] + custom_pad[2 * d + 1] for d in range(num_spatial_dims)]) raise ValueError('Invalid padding pad_type "{}"'.format(pad_type)) @@ -242,7 +274,7 @@ def spatial_dimensions_out_shape( if dilations is None: dilations = [1] * num_spatial_dims if custom_pad is None: - custom_pad = [0] * num_spatial_dims * 2 + custom_pad = np.array([0] * num_spatial_dims * 2) if not ( len(input_shape) == len(kernel_shape) @@ -259,6 +291,22 @@ def spatial_dimensions_out_shape( "must all be the same length" ) + effective_ks = effective_kernel(kernel_shape, dilations) + if isinstance(strides, np.ndarray): + strides = tuple(strides.tolist()) + if isinstance(custom_pad, np.ndarray): + custom_pad = tuple(custom_pad.tolist()) + cache_key = CONV_POOLING_TYPE_INFERENCE_CACHE.get_cache_key( + input_shape, + pad_type, + custom_pad, + strides, + effective_ks, + ceil_mode, + ) + if cache_key in CONV_POOLING_TYPE_INFERENCE_CACHE: + return CONV_POOLING_TYPE_INFERENCE_CACHE[cache_key] + pad = aggregated_pad( pad_type=pad_type, kernel_shape=kernel_shape, @@ -267,7 +315,7 @@ def spatial_dimensions_out_shape( dilations=dilations, custom_pad=custom_pad, ) - effective_ks = effective_kernel(kernel_shape, dilations) + out_shape = [] for r in range(num_spatial_dims): # only check if `input_shape` (spatial part of the input image) is symbolic, because: @@ -288,6 +336,7 @@ def spatial_dimensions_out_shape( if out_dim <= 0: raise ValueError(f"spatial dimension {r} has invalid output size {out_dim}") out_shape.append(out_dim) + CONV_POOLING_TYPE_INFERENCE_CACHE[cache_key] = out_shape return out_shape @@ -409,17 +458,85 @@ def _promoted_var(var, promoted_dtype): return input_vars +def get_squeeze_axes(squeeze_mask, rank): + """ + Utility function to get the squeeze_axes from squeeze_mask. + i.e., returns a list of indices ``i`` where ``squeeze_mask[i] == True``. + For instance, given ``squeeze_mask = [True, False, True]``, + this utility returns ``[0, 2]`` + """ + if squeeze_mask is None: + squeeze_mask = [False] * rank + squeeze_axes = [] + for idx, mask in enumerate(squeeze_mask): + if mask: + squeeze_axes.append(idx) + return squeeze_axes + +def get_param_val(param): + """ + Given a param, if it is not None, returns param.val, else returns None. 
+ """ + if param is None: + return None + return param.val + +def solve_slice_by_index_slice(x_shape, begin, end, stride, begin_mask, end_mask, squeeze_mask): + """ + Utility function to solve the slices of tensor slicing + """ + # set default values for parameters + rank = len(x_shape) + begin = [int(i) for i in list(begin[:])] + end = [int(i) for i in list(end[:])] + if stride is None: + stride = [1] * rank + if begin_mask is None: + begin_mask = [False] * rank + if end_mask is None: + end_mask = [False] * rank + if squeeze_mask is None: + squeeze_mask = [False] * rank + + # compute slices + slices = [] + for idx, mask in enumerate(begin_mask): + if mask: + begin[idx] = None + for idx, mask in enumerate(end_mask): + if mask: + end[idx] = None + for idx, mask in enumerate(squeeze_mask): + if mask: + end[idx] = None + stride[idx] = np.iinfo( + np.int32 + ).max # We slice out only 1 element by setting stride to INF + for idx in range(rank): + slices.append(slice(begin[idx], end[idx], stride[idx])) + + return tuple(slices) + def solve_slice_by_index_shape(x_shape, begin, end, stride, begin_mask, end_mask, squeeze_mask): """ Helper function to solve the shape of tensor slicing. """ - ret_shape = [] - + # set default values + rank = len(x_shape) if begin is None or len(begin) == 0: - begin = [None] * len(x_shape) + begin = [None] * rank if end is None or len(end) == 0: - end = [None] * len(x_shape) - + end = [None] * rank + if stride is None: + stride = [1] * rank + if begin_mask is None: + begin_mask = [False] * rank + if end_mask is None: + end_mask = [False] * rank + if squeeze_mask is None: + squeeze_mask = [False] * rank + + # basic validation for tensor shape if len(begin) != len(x_shape): raise TypeError( "slice_by_index op: size of 'begin', {}, is not equal to the rank of input, which is {}".format( @@ -434,6 +551,7 @@ def solve_slice_by_index_shape(x_shape, begin, end, stride, begin_mask, end_mask ) # solve for shape inference + ret_shape = [] for idx in range(len(x_shape)): # skip if we want to squeeze the dimension if squeeze_mask[idx]: diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_unary.py b/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_unary.py index ec7fa8cc2..c5ebc40d0 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_unary.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_unary.py @@ -848,7 +848,7 @@ class cast(Operation): @classmethod def supported_dtypes(cls): - return (builtin_to_string(v) for v in cls.type_domains["T"]) + return [builtin_to_string(v) for v in cls.type_domains["T"]] def type_inference(self): if self.dtype.val not in self.supported_dtypes(): diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/linear.py b/coremltools/converters/mil/mil/ops/defs/iOS15/linear.py index ffa806739..698c680fe 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/linear.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/linear.py @@ -57,8 +57,10 @@ class linear(Operation): def default_inputs(self): Dout = self.weight.shape[0] + # If the bias is not provided, we initialize it a zero vector + # with dtype of weight. 
return DefaultInputs( - bias=np.array([0.0] * Dout, dtype=nptype_from_builtin(self.x.dtype)), + bias=np.array([0.0] * Dout, dtype=nptype_from_builtin(self.weight.dtype)), ) def type_inference(self): diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/pool.py b/coremltools/converters/mil/mil/ops/defs/iOS15/pool.py index b1d25fb2b..ce245f19d 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/pool.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/pool.py @@ -101,16 +101,26 @@ class avg_pool(Pooling): * ``S == len(D_in)``. pad_type: const str (Required) - Must be one of ``valid``, ``same``, ``custom`` or ``same_lower``. - - * ``valid``: No padding. This is equivalent to custom pad with ``pad[i] = 0, for - all i``. - * ``same`` : This is equivalent to custom pad with ``pad[2*i] + pad[2*i+1] = kernel_size[i]``. - * ``custom``: Specify custom padding in the parameter pad. note that ``same`` - padding is equivalent to custom padding with - ``pad[2*i] + pad[2*i+1] = kernel_size[i]``. - * ``same_lower``: Similar to ``same`` but the padding - will place extra rows/cols on the top/left if the padding amount is odd. + + Must be one of the following: + + * ``valid``: No padding. This is equivalent to custom pad with + ``pad[2*i] == pad[2*i+1] == 0, for i=0,...,len(d_in)-1``. + * ``custom``: Specify custom padding in the parameter ``pad``. + * ``same``: Input is padded such that out spatial shapes are + ``d_out[i] = ceil(d_in[i] / strides[i])``. + * ``same_lower``: Similar to ``same`` but the padding + will place extra rows/cols on the top/left if the padding amount is odd. + + Specifically, for ``i = 0,..,,len(d_in)-1``, the equivalent paddings are + calculated as follows: + + * ``dilated_kernel = (K[i] - 1) * dilate[i] + 1`` + * If ``dilated_kernel`` is odd, + ``padding[2*i] = padding[2*i+1] = floor(dilated_kernel / 2)`` + * Otherwise: + ``padding[2*i] = ceil((dilated_kernel - 1) / 2)``, + ``padding[2*i+1] = floor((dilated_kernel - 1) / 2)`` pad: const<[P],i32> (Optional. Default to all 0s) * ``pad`` represents the number of elements to pad before and after each diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/random.py b/coremltools/converters/mil/mil/ops/defs/iOS15/random.py index f6663cf48..c89f9fe1a 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/random.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/random.py @@ -45,13 +45,13 @@ class random_bernoulli(RandomDistribution): r""" Returns a tensor with the specified shape, with random values from a Bernoulli distribution. - + .. math:: f(k) = \begin{cases}1-p &\text{if } k = 0\\ p &\text{if } k = 1\end{cases} for :math:`k` in :math:`\{0, 1\}`. - + Parameters ---------- shape: (Required) @@ -62,7 +62,7 @@ class random_bernoulli(RandomDistribution): * The probability of sampling ``1``. Defaults to ``0.5``. seed: const (Optional) * Seed to create a reproducible sequence of values across multiple invokes. - + Returns ------- <\*, T> @@ -76,7 +76,7 @@ class random_bernoulli(RandomDistribution): -------- random_categorical, random_normal, random_uniform """ - + input_spec = ( InputSpec( shape=TensorInputType(type_domain=types.int32), @@ -85,7 +85,7 @@ class random_bernoulli(RandomDistribution): ) + RandomDistribution.input_spec ) - + type_domains = { "T": (types.fp16, types.fp32), } @@ -106,23 +106,29 @@ def type_inference(self): class random_categorical(Operation): """ Returns random values from a categorical distribution. 
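# Worked numbers (illustrative only) for the equivalent-padding formulas in the
# avg_pool ``pad_type`` documentation in the pool.py hunk above.
import math

def equivalent_padding(K, dilate):
    dilated_kernel = (K - 1) * dilate + 1
    if dilated_kernel % 2 == 1:
        return dilated_kernel // 2, dilated_kernel // 2
    return math.ceil((dilated_kernel - 1) / 2), (dilated_kernel - 1) // 2

assert equivalent_padding(K=3, dilate=2) == (2, 2)  # dilated kernel 5 (odd)
assert equivalent_padding(K=2, dilate=1) == (1, 0)  # dilated kernel 2 (even)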
- + Parameters ---------- - shape: <\*D_in, T> - * N-dimensional tensor, one of ``logits`` (event log-probabilities) or ``probs`` - (event probabilities). The first ``N - 1`` dimensions specifies distributions, - and the last dimension represents a vector of probabilities. + x: <\*D_in, T> + * N-dimensional tensor which represents ``logits`` (event log-probabilities) or ``probs`` + (event probabilities) depending on ``mode``. The first ``N - 1`` dimensions specifies + distributions, and the last dimension represents a vector of probabilities. mode: const (Optional) One of ``['logits', 'probs']``. Defaults to ``logits``. + When set to ``probs``, an element-wise log layer will be added to calculate logits. size: const (Optional) Number of samples to draw. Defaults to ``1``. + When set as ``1``, it's categorical distribution. + When set larger than ``1``, it's actually multinomial distribution by drawing with + replacement. It means that when a sample index is drawn, it can be drawn again. + The categorical distribution is a special case of the multinomial distribution, giving + the probabilities of potential outcomes of a single drawing rather than multiple drawings. seed: const (Optional) Seed to create a reproducible sequence of values across multiple invokes. - + Returns ------- <\*D_in[:-1] + [size], T> @@ -136,14 +142,14 @@ class random_categorical(Operation): -------- random_bernoulli, random_normal, random_uniform """ - + input_spec = InputSpec( x=TensorInputType(type_domain="T"), mode=TensorInputType(const=True, optional=True, type_domain=types.str), size=TensorInputType(const=True, optional=True, type_domain=types.int32), seed=TensorInputType(const=True, optional=True, type_domain=types.int32), ) - + type_domains = { "T": (types.fp16, types.fp32), } @@ -166,7 +172,7 @@ class random_normal(RandomDistribution): r""" Returns a tensor with the specified shape, with random values from a normal distribution. - + Parameters ---------- shape: (Required) @@ -179,7 +185,7 @@ class random_normal(RandomDistribution): The standard deviation (width) of the normal distribution. Defaults to ``1.0``. seed: const (Optional) Seed to create a reproducible sequence of values across multiple invokes. - + Returns ------- <\*, T> @@ -193,7 +199,7 @@ class random_normal(RandomDistribution): -------- random_categorical, random_bernoulli, random_uniform """ - + input_spec = ( InputSpec( shape=TensorInputType(type_domain=types.int32), @@ -203,7 +209,7 @@ class random_normal(RandomDistribution): ) + RandomDistribution.input_spec ) - + type_domains = { "T": (types.fp16, types.fp32), } @@ -229,15 +235,15 @@ class random_uniform(RandomDistribution): Returns a tensor with the specified shape with random values from a uniform distribution. Samples are uniformly distributed over the half-open interval ``[low, high)`` (includes low, but excludes high). - + .. math:: p(x) = \frac{1}{high - low} - + For a real number :math:`x`. - + When ``high == low``, values of ``low`` will be returned. If ``high < low``, the results are officially undefined and may eventually raise an error. - + Parameters ---------- shape: (Required) @@ -250,7 +256,7 @@ class random_uniform(RandomDistribution): * Upper boundary of the output interval (exclusive). Defaults to ``1.0``. seed: const (Optional) * Seed to create a reproducible sequence of values across multiple invokes. 
- + Returns ------- <\*, T> @@ -264,7 +270,7 @@ class random_uniform(RandomDistribution): -------- random_categorical, random_bernoulli, random_normal """ - + input_spec = ( InputSpec( shape=TensorInputType(type_domain=types.int32), @@ -274,7 +280,7 @@ class random_uniform(RandomDistribution): ) + RandomDistribution.input_spec ) - + type_domains = { "T": (types.fp16, types.fp32), } diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py index 3202c480b..9a5340764 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py @@ -524,6 +524,10 @@ def type_inference(self): if len(pad) % 2 != 0: raise ValueError("Number of elements in the argument Pad must be divisible by 2.") + for i in range(len(pad)): + if not is_symbolic(pad[i]) and pad[i] < 0: + raise ValueError(f"pad must be non-negative integer, got {pad[i]} at index {i}") + pad = pad.reshape(-1, 2) if pad.shape[0] > len(ret_shape): diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py index a409b36af..9f218ac15 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py @@ -19,7 +19,12 @@ from coremltools.converters.mil.mil.input_type import DefaultInputs, InputSpec, TensorInputType from coremltools.converters.mil.mil.operation import SYMBOL, VALUE from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op -from coremltools.converters.mil.mil.ops.defs._utils import solve_slice_by_index_shape +from coremltools.converters.mil.mil.ops.defs._utils import ( + get_param_val, + get_squeeze_axes, + solve_slice_by_index_shape, + solve_slice_by_index_slice, +) from coremltools.converters.mil.mil.types.symbolic import ( any_symbolic, any_variadic, @@ -514,23 +519,16 @@ def default_inputs(self): ) def type_inference(self): - - # get tensor and set default value - begin = self.begin.val - end = self.end.val - x_rank = self.x.rank - stride = self.stride.val if self.stride is not None else [1] * x_rank - begin_mask = ( - self.begin_mask.val if self.begin_mask is not None else [False] * x_rank - ) - end_mask = self.end_mask.val if self.end_mask is not None else [False] * x_rank - squeeze_mask = ( - self.squeeze_mask.val if self.squeeze_mask is not None else [False] * x_rank - ) - # solve shape - x_shape = self.x.shape - ret_shape = solve_slice_by_index_shape(x_shape, begin, end, stride, begin_mask, end_mask, squeeze_mask) + ret_shape = solve_slice_by_index_shape( + self.x.shape, + self.begin.val, + self.end.val, + get_param_val(self.stride), + get_param_val(self.begin_mask), + get_param_val(self.end_mask), + get_param_val(self.squeeze_mask), + ) if len(ret_shape) == 0: # Scalar case. 
@@ -541,41 +539,21 @@ def type_inference(self): def value_inference(self): if self.x.sym_val is None or self.begin.val is None or self.end.val is None: return None - begin = [int(i) for i in list(self.begin.val[:])] - end = [int(i) for i in list(self.end.val[:])] - stride = [1] * self.x.rank if self.stride is None else self.stride.val - begin_mask = ( - [False] * self.x.rank if self.begin_mask is None else self.begin_mask.val - ) - end_mask = [False] * self.x.rank if self.end_mask is None else self.end_mask.val - squeeze_mask = ( - [False] * self.x.rank - if self.squeeze_mask is None - else self.squeeze_mask.val - ) - slices = [] - for idx, mask in enumerate(begin_mask): - if mask: - begin[idx] = None - for idx, mask in enumerate(end_mask): - if mask: - end[idx] = None - squeeze_axes = [] - for idx, mask in enumerate(squeeze_mask): - if mask: - end[idx] = None - stride[ - idx - ] = 2147483647 # We slice out only 1 element by setting stride to INF - squeeze_axes.append(idx) - for idx in range(self.x.rank): - slices.append(slice(begin[idx], end[idx], stride[idx])) - - slices = tuple(slices) + # solve the data slices and slice tensor + slices = solve_slice_by_index_slice( + self.x.shape, + self.begin.val, + self.end.val, + get_param_val(self.stride), + get_param_val(self.begin_mask), + get_param_val(self.end_mask), + get_param_val(self.squeeze_mask), + ) res = self.x.sym_val[slices] - # remove squeezed axes + # remove squeeze_axes + squeeze_axes = get_squeeze_axes(get_param_val(self.squeeze_mask), self.x.rank) if len(squeeze_axes) > 0: if len(squeeze_axes) == len(res.shape): if len(res) == 0: diff --git a/coremltools/converters/mil/mil/ops/defs/iOS16/scatter_gather.py b/coremltools/converters/mil/mil/ops/defs/iOS16/scatter_gather.py index 1e3e88c61..d7f56371f 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS16/scatter_gather.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS16/scatter_gather.py @@ -4,13 +4,13 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause from coremltools.converters.mil.mil import Operation, types -from coremltools.converters.mil.mil.input_type import (DefaultInputs, - InputSpec, - TensorInputType) -from coremltools.converters.mil.mil.operation import (SYMBOL, VALUE, - precondition) +from coremltools.converters.mil.mil.input_type import DefaultInputs, InputSpec, TensorInputType +from coremltools.converters.mil.mil.operation import SYMBOL, VALUE, precondition from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op from coremltools.converters.mil.mil.ops.defs._utils import compute_gather +from coremltools.converters.mil.mil.ops.defs.iOS15.scatter_gather import ( + gather_along_axis as _gather_along_axis_iOS15, +) from coremltools.converters.mil.mil.ops.defs.iOS16 import _IOS16_TARGET @@ -20,12 +20,13 @@ class gather(Operation): The iOS16 version. This section documents only the differences between this version and the iOS 15 :py:class:`~.iOS15.scatter_gather.gather`. - + This version supports ``batch_dims``, similar to `tf.gather `_. + Input parameter ``indices`` now supports ``int16`` and ``uint16``. Parameters ---------- - x: tensor<\*D, U> (Required) + x: tensor<\*D, T> (Required) indices: tensor<\*N, I> (Required) * Indices values may be negative. More precisely, ``-D[axis]<= v < D[axis]`` for ``v`` in ``indices``. axis: const i32 (Optional. 
Default=``0``) @@ -50,14 +51,14 @@ class gather(Operation): """ input_spec = InputSpec( - x=TensorInputType(type_domain="U"), + x=TensorInputType(type_domain="T"), indices=TensorInputType(type_domain="I"), axis=TensorInputType(const=True, optional=True, type_domain=types.int32), batch_dims=TensorInputType(const=True, optional=True, type_domain=types.int32) ) - + type_domains = { - "U": (types.fp16, types.fp32, types.int32), + "T": (types.fp16, types.fp32, types.int32), "I": (types.int32, types.uint16, types.int16), } @@ -75,11 +76,11 @@ def value_inference(self): # only allow x to be symbolic. indices cannot. return None return compute_gather( - params=self.x.sym_val, - indices=self.indices.val, - axis=self.axis.val, - batch_dims=self.batch_dims.val - ) + params=self.x.sym_val, + indices=self.indices.val, + axis=self.axis.val, + batch_dims=self.batch_dims.val, + ) def type_inference(self): # validate parameters @@ -100,7 +101,7 @@ def type_inference(self): "batch_dims {} must be less or equal to than indices.rank {} for node {}".format( self.batch_dims.val, self.indices.rank, self.name ) - ) + ) output_rank = self.x.rank - 1 + self.indices.rank - self.batch_dims.val if output_rank == 0: @@ -115,6 +116,44 @@ def type_inference(self): return types.tensor(self.x.dtype, out_shape) + +@register_op(opset_version=_IOS16_TARGET) +class gather_along_axis(_gather_along_axis_iOS15): + """ + The iOS16 version. + The only difference between this version and the iOS 15 :py:class:`~.iOS15.scatter_gather.gather_along_axis`. + is that input parameter ``indices`` now supports ``int16`` and ``uint16``. + + Parameters + ---------- + x: tensor<\*D, T> (Required) + indices: tensor<\*K, I> (Required) + axis: const i32 (Optional): + * Default to ``0``. + + Returns + ------- + tensor<\*D, T>: + * Output tensor has the same shape as ``indices``. + + Attributes + ---------- + T: fp16, fp32, i32 + I: uint16, int16, int32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + indices=TensorInputType(type_domain="I"), + axis=TensorInputType(const=True, optional=True, type_domain=types.int32), + ) + + type_domains = { + "T": (types.fp16, types.fp32, types.int32), + "I": (types.int32, types.uint16, types.int16), + } + + @register_op(opset_version=_IOS16_TARGET) class gather_nd(Operation): """ @@ -123,11 +162,12 @@ class gather_nd(Operation): iOS 15 :py:class:`~.iOS15.scatter_gather.gather_nd`. This version supports ``batch_dims``. + Input parameter ``indices`` now supports ``int16`` and ``uint16``. Parameters ---------- x: tensor<\*D, T> (Required) - indices: tensor<\*K, i32> (Required) + indices: tensor<\*K, I> (Required) batch_dims: const i32 (Optional. Default=``0``) * The number of batch dimensions. 
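# A rough NumPy reference (not the coremltools implementation) for the
# ``batch_dims`` semantics described in the iOS16 gather documentation above:
# the leading ``batch_dims`` dimensions of ``x`` and ``indices`` are matched
# pairwise, and the output rank is x.rank - 1 + indices.rank - batch_dims.
import numpy as np

def gather_with_batch_dims(x, indices, axis, batch_dims):
    if batch_dims == 0:
        return np.take(x, indices, axis=axis)
    # Recurse over one matched batch dimension at a time.
    return np.stack(
        [
            gather_with_batch_dims(x[i], indices[i], axis - 1, batch_dims - 1)
            for i in range(x.shape[0])
        ]
    )

x = np.arange(2 * 2 * 3).reshape(2, 2, 3)
indices = np.array([[1, 0], [0, 0]])
out = gather_with_batch_dims(x, indices, axis=1, batch_dims=1)
assert out.shape == (2, 2, 3)  # 3 - 1 + 2 - 1 = 3 dims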
@@ -139,6 +179,7 @@ class gather_nd(Operation): Attributes ---------- T: fp16, fp32, i32 + I: uint16, int16, int32 References ---------- @@ -146,13 +187,13 @@ class gather_nd(Operation): """ input_spec = InputSpec( - x=TensorInputType(type_domain="U"), + x=TensorInputType(type_domain="T"), indices=TensorInputType(type_domain="I"), batch_dims=TensorInputType(const=True, optional=True, type_domain=types.int32), ) - + type_domains = { - "U": (types.fp16, types.fp32, types.int32), + "T": (types.fp16, types.fp32, types.int32), "I": (types.int32, types.uint16, types.int16), } diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/scatter_gather.py b/coremltools/converters/mil/mil/ops/defs/iOS17/scatter_gather.py index 95898c616..f30d3c761 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS17/scatter_gather.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/scatter_gather.py @@ -8,9 +8,6 @@ from coremltools.converters.mil.mil import types from coremltools.converters.mil.mil.input_type import DefaultInputs, InputSpec, TensorInputType from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op -from coremltools.converters.mil.mil.ops.defs.iOS15.scatter_gather import ( - gather_along_axis as _gather_along_axis_iOS15, -) from coremltools.converters.mil.mil.ops.defs.iOS15.scatter_gather import scatter as _scatter_iOS15 from coremltools.converters.mil.mil.ops.defs.iOS15.scatter_gather import ( scatter_along_axis as _scatter_along_axis_iOS15, @@ -19,6 +16,9 @@ scatter_nd as _scatter_nd_iOS15, ) from coremltools.converters.mil.mil.ops.defs.iOS16.scatter_gather import gather as _gather_iOS16 +from coremltools.converters.mil.mil.ops.defs.iOS16.scatter_gather import ( + gather_along_axis as _gather_along_axis_iOS16, +) from coremltools.converters.mil.mil.ops.defs.iOS16.scatter_gather import ( gather_nd as _gather_nd_iOS16, ) @@ -247,6 +247,8 @@ class gather(_gather_iOS16): This section documents only the differences between this version and the iOS 16 :py:class:`~.iOS16.scatter_gather.gather`. The major differences are as follows: + - Input parameter ``x`` adds support for ``int16``, ``uint16``, ``int8``, and ``uint8``. + - Input parameter ``indices`` adds support for ``int8`` and ``uint8``. - Input parameter ``indices`` now supports only positive values -- negative values are considered out-of-bound. If support for negative indices is required, they must be explicitly converted to positive values, using the following:: @@ -262,7 +264,7 @@ class gather(_gather_iOS16): Parameters ---------- - x: tensor<\*D, U> (Required) + x: tensor<\*D, T> (Required) indices: tensor<\*N, I> (Required) * Indices values may be negative. More precisely, ``-D[axis]<= v < D[axis]`` for ``v`` in ``indices``. axis: const i32 (Optional. 
Default=``0``) @@ -283,18 +285,31 @@ class gather(_gather_iOS16): Attributes ---------- - T: fp16, fp32, i32 - I: uint16, int16, int32 + T: fp16, fp32, int32, int16, uint16, int8, uint8 + I: int32, int16, uint16, int8, uint8 """ input_spec = InputSpec( - x=TensorInputType(type_domain="U"), + x=TensorInputType(type_domain="T"), indices=TensorInputType(type_domain="I"), axis=TensorInputType(const=True, optional=True, type_domain=types.int32), batch_dims=TensorInputType(const=True, optional=True, type_domain=types.int32), validate_indices=TensorInputType(const=True, optional=True, type_domain=types.bool), ) + type_domains = { + "T": ( + types.fp16, + types.fp32, + types.int32, + types.int16, + types.uint16, + types.int8, + types.uint8, + ), + "I": (types.int32, types.int16, types.uint16, types.int8, types.uint8), + } + def default_inputs(self): return DefaultInputs(axis=0, batch_dims=0, validate_indices=False) @@ -314,17 +329,17 @@ def type_inference(self): @register_op(opset_version=_IOS17_TARGET) -class gather_along_axis(_gather_along_axis_iOS15): +class gather_along_axis(_gather_along_axis_iOS16): """ Take the values along ``axis`` at locations ``indices``. The major differences from the previous version are illustrated in :py:class:`gather`. - For more information, see the iOS 15 :py:class:`~.iOS15.scatter_gather.gather_along_axis`. + For more information, see the iOS 16 :py:class:`~.iOS16.scatter_gather.gather_along_axis`. Parameters ---------- x: tensor<\*D, T> (Required) - indices: tensor<\*K, i32> (Required) + indices: tensor<\*K, I> (Required) * ``rank(indices) == rank(x)``. axis: const i32 (Optional): * Default to ``0``. @@ -342,16 +357,30 @@ class gather_along_axis(_gather_along_axis_iOS15): Attributes ---------- - T: fp16, fp32, i32 + T: fp16, fp32, int32, int16, uint16, int8, uint8 + I: int32, int16, uint16, int8, uint8 """ input_spec = InputSpec( x=TensorInputType(type_domain="T"), - indices=TensorInputType(type_domain=types.int32), + indices=TensorInputType(type_domain="I"), axis=TensorInputType(const=True, optional=True, type_domain=types.int32), validate_indices=TensorInputType(const=True, optional=True, type_domain=types.bool), ) + type_domains = { + "T": ( + types.fp16, + types.fp32, + types.int32, + types.int16, + types.uint16, + types.int8, + types.uint8, + ), + "I": (types.int32, types.int16, types.uint16, types.int8, types.uint8), + } + def default_inputs(self): return DefaultInputs( axis=0, @@ -383,7 +412,7 @@ class gather_nd(_gather_nd_iOS16): Parameters ---------- x: tensor<\*D, T> (Required) - indices: tensor<\*K, i32> (Required) + indices: tensor<\*K, I> (Required) batch_dims: const i32 (Optional. Default=``0``) * The number of batch dimensions. 
validate_indices: const bool (Optional) @@ -400,16 +429,30 @@ class gather_nd(_gather_nd_iOS16): Attributes ---------- - T: fp16, fp32, i32 + T: fp16, fp32, int32, int16, uint16, int8, uint8 + I: int32, int16, uint16, int8, uint8 """ input_spec = InputSpec( - x=TensorInputType(type_domain="U"), + x=TensorInputType(type_domain="T"), indices=TensorInputType(type_domain="I"), batch_dims=TensorInputType(const=True, optional=True, type_domain=types.int32), validate_indices=TensorInputType(const=True, optional=True, type_domain=types.bool), ) + type_domains = { + "T": ( + types.fp16, + types.fp32, + types.int32, + types.int16, + types.uint16, + types.int8, + types.uint8, + ), + "I": (types.int32, types.int16, types.uint16, types.int8, types.uint8), + } + def default_inputs(self): return DefaultInputs( batch_dims=0, diff --git a/coremltools/converters/mil/mil/ops/registry.py b/coremltools/converters/mil/mil/ops/registry.py index 49946796c..51debd752 100644 --- a/coremltools/converters/mil/mil/ops/registry.py +++ b/coremltools/converters/mil/mil/ops/registry.py @@ -98,7 +98,6 @@ def register_op(_cls=None, is_custom_op=False, namespace=None, opset_version=tar """ def class_wrapper(op_cls): op_type = op_cls.__name__ - op_cls.__name__ = op_type # debug message op_msg = "op" @@ -117,10 +116,10 @@ def class_wrapper(op_cls): # Check that op_type is prefixed with namespace if op_type[: len(namespace)] != namespace: msg = ( - "Dialect pp type {} registered under {} namespace must " - + "prefix with {}" + "Dialect op type {} registered under {} namespace must " + "prefix with {}" ) raise ValueError(msg.format(op_type, namespace, namespace)) + op_cls._dialect_namespace = namespace else: op_reg = SSAOpRegistry.core_ops diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_conv.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_conv.py index 0af19fa19..8817aa59b 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS14/test_conv.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_conv.py @@ -218,6 +218,44 @@ def build(x): class TestConv: + @pytest.mark.parametrize( + "backend, pad_type", + itertools.product( + backends, + ["valid", "same", "same_lower", "custom"], + ), + ) + def test_type_inference_cache_no_pad(self, backend, pad_type): + # Test the type inference has the caching mechanism to ensure + # same symbolic input shapes results in the same output shape + if pad_type == "same_lower" and backend.opset_version == ct.target.iOS15: + return + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(1, 3, get_new_symbol(), get_new_symbol()), dtype=types.fp32) + ], + opset_version=backend.opset_version, + ) + def prog(x): + weight = np.random.rand(2, 3, 2, 2) + + # Basic conv + conv_1 = mb.conv(x=x, weight=weight) + conv_2 = mb.conv(x=x, weight=weight) + assert conv_1.shape == conv_2.shape + + # With strides / dialations + conv_1 = mb.conv(x=x, weight=weight, strides=[1, 2], dilations=[3, 4]) + conv_2 = mb.conv(x=x, weight=weight, strides=[1, 2], dilations=[3, 4]) + assert conv_1.shape == conv_2.shape + + # With padding + conv_1 = mb.conv(x=x, weight=weight, pad_type=pad_type, pad=[2, 3, 4, 5]) + conv_2 = mb.conv(x=x, weight=weight, pad_type=pad_type, pad=[2, 3, 4, 5]) + assert conv_1.shape == conv_2.shape + return conv_1 + @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) @pytest.mark.parametrize( "compute_unit, backend, padding_mode, conv_dim", diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_linear.py 
b/coremltools/converters/mil/mil/ops/tests/iOS14/test_linear.py index c3fafa42c..0da0d8320 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS14/test_linear.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_linear.py @@ -10,7 +10,7 @@ import coremltools as ct from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil import get_new_symbol, types from coremltools.converters.mil.mil.ops.tests.iOS14 import backends from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder from coremltools.converters.mil.mil.types import builtin_to_string, nptype_from_builtin @@ -109,7 +109,7 @@ def build(x): itertools.product(compute_units, backends, [types.int32, types.fp16, types.fp32]), ) def test_default_bias_type(self, compute_unit, backend, input_type): - # Test the default bias matches the dtype of x + # Test the default bias matches the dtype of x and weight. @mb.program( input_specs=[mb.TensorSpec(shape=(1, 2), dtype=types.fp32)], opset_version=backend.opset_version, @@ -365,3 +365,35 @@ def test_builder_eval(self): equation = "bcd,dce->bce" v = mb.einsum(values=(x_val, y_val), equation=equation) np.testing.assert_allclose(np.einsum(equation, x_val, y_val), v.val, atol=1e-04, rtol=1e-05) + + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_symbolic_input_conv_and_einsum(self, backend): + """ + Test a pattern of: + + %1 = conv_1(%x) + %2 = conv_2(%x) + %3 = transpose(%2, [0, 3, 2, 1]) + %4 = einsum(%1, %3) + + If ``%x`` has symbolic shape and ``conv_1, conv_2`` have the same + configuration, the above program should pass the type inference. + """ + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(1, 3, get_new_symbol(), get_new_symbol()), dtype=types.fp32) + ], + opset_version=backend.opset_version, + ) + def prog(x): + weight = np.random.rand(2, 3, 2, 2) + conv_1 = mb.conv(x=x, weight=weight) + conv_2 = mb.conv(x=x, weight=weight) + conv_2_transpose = mb.transpose(x=conv_2, perm=[0, 3, 2, 1]) + return mb.einsum(values=(conv_1, conv_2_transpose), equation="abcd,adce->abce") + + assert prog is not None diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_pool.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_pool.py index 63ffe3603..2cd35ff27 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS14/test_pool.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_pool.py @@ -10,13 +10,49 @@ import coremltools as ct from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil import get_new_symbol, types from coremltools.converters.mil.mil.ops.tests.iOS14 import backends from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder from coremltools.converters.mil.testing_reqs import compute_units class TestAvgPool: + @pytest.mark.parametrize( + "backend, pad_type", + itertools.product( + backends, + ["valid", "same", "same_lower", "custom"], + ), + ) + def test_type_inference_cache(self, backend, pad_type): + # Test the type inference has the caching mechanism to ensure + # same symbolic input shapes results in the same output shape + if pad_type == "same_lower" and backend.opset_version == ct.target.iOS15: + return + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(1, 3, get_new_symbol(), get_new_symbol()), dtype=types.fp32) + ], + opset_version=backend.opset_version, + ) + def prog(x): + # Basic pool + pool_1 = 
mb.avg_pool(x=x, kernel_sizes=[1, 2], pad_type=pad_type) + pool_2 = mb.avg_pool(x=x, kernel_sizes=[1, 2], pad_type=pad_type) + assert pool_1.shape == pool_1.shape + + # With strides + pool_1 = mb.avg_pool(x=x, kernel_sizes=[1, 2], strides=[1, 2], pad_type=pad_type) + pool_2 = mb.avg_pool(x=x, kernel_sizes=[1, 2], strides=[1, 2], pad_type=pad_type) + assert pool_1.shape == pool_1.shape + + # With padding + pool_1 = mb.avg_pool(x=x, kernel_sizes=[1, 2], pad_type=pad_type, pad=[2, 3, 4, 5]) + pool_2 = mb.avg_pool(x=x, kernel_sizes=[1, 2], pad_type=pad_type, pad=[2, 3, 4, 5]) + assert pool_1.shape == pool_2.shape + return pool_1 + @pytest.mark.parametrize( "compute_unit, backend, inputshape_kernelshape", itertools.product( diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_scatter_gather.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_scatter_gather.py index a10f516f4..8c7ef8374 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS14/test_scatter_gather.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_scatter_gather.py @@ -491,15 +491,18 @@ def prog(x): class TestGatherAlongAxis: @pytest.mark.parametrize( - "compute_unit, backend", - itertools.product(compute_units, backends), + "compute_unit, backend, x_dtype, indices_dtype", + itertools.product(compute_units, backends, [np.float32, np.float16, np.int32], [np.int32]), ) - def test_builder_to_backend_smoke(self, compute_unit, backend): - x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - indices = np.array([[1, 0, 1], [1, 1, 0]], dtype=np.int32) + def test_builder_to_backend_smoke(self, compute_unit, backend, x_dtype, indices_dtype): + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=x_dtype) + indices = np.array([[1, 0, 1], [1, 1, 0]], dtype=indices_dtype) + builtin_x_dtype = types.numpy_type_to_builtin_type(x_dtype) input_placeholders = { - "x": mb.placeholder(shape=x.shape), - "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + "x": mb.placeholder(shape=x.shape, dtype=builtin_x_dtype), + "indices": mb.placeholder( + shape=indices.shape, dtype=types.numpy_type_to_builtin_type(indices_dtype) + ), } input_values = {"x": x, "indices": indices} @@ -514,19 +517,19 @@ def build(x, indices): ] expected_output_types = [ - (2, 3, types.fp32), - (2, 3, types.fp32), - (2, 3, types.fp32), - (2, 3, types.fp32), - (2, 3, types.fp32), + (2, 3, builtin_x_dtype), + (2, 3, builtin_x_dtype), + (2, 3, builtin_x_dtype), + (2, 3, builtin_x_dtype), + (2, 3, builtin_x_dtype), ] expected_outputs = [ - np.array([[4, 2, 6], [4, 5, 3]], dtype=np.float32), - np.array([[2, 1, 2], [5, 5, 4]], dtype=np.float32), - np.array([[4, 2, 6], [4, 5, 3]], dtype=np.float32), - np.array([[2, 1, 2], [5, 5, 4]], dtype=np.float32), - np.array([[4, 2, 6], [4, 5, 3]], dtype=np.float32), + np.array([[4, 2, 6], [4, 5, 3]], dtype=x_dtype), + np.array([[2, 1, 2], [5, 5, 4]], dtype=x_dtype), + np.array([[4, 2, 6], [4, 5, 3]], dtype=x_dtype), + np.array([[2, 1, 2], [5, 5, 4]], dtype=x_dtype), + np.array([[4, 2, 6], [4, 5, 3]], dtype=x_dtype), ] run_compare_builder( @@ -566,30 +569,36 @@ def prog(x): @staticmethod def _test_builder_to_backend_programmatic( - compute_unit, backend, rank_axis, force_non_negative_indices + compute_unit, backend, rank_axis, x_dtype, indices_dtype, force_non_negative_indices ): rank, axis = rank_axis x_shape = np.random.randint(low=2, high=8, size=rank) indices_shape = np.copy(x_shape) indices_shape[axis] = np.random.randint(low=1, high=8) - x = np.random.rand(*x_shape).astype(np.float32) + x = 
np.random.rand(*x_shape).astype(x_dtype) - # IOS17 gather_along_axis requires non-negative indices. - lower_bound = 0 if force_non_negative_indices else -x_shape[axis] - indices = np.random.randint(lower_bound, x_shape[axis], size=indices_shape).astype(np.int32) + lower_bound = -x_shape[axis] + if force_non_negative_indices or np.issubdtype(indices_dtype, np.unsignedinteger): + lower_bound = 0 + indices = np.random.randint(lower_bound, x_shape[axis], size=indices_shape).astype( + indices_dtype + ) def build(x, indices): return mb.gather_along_axis(x=x, indices=indices, axis=axis) + builtin_x_dtype = types.numpy_type_to_builtin_type(x_dtype) input_placeholders = { - "x": mb.placeholder(shape=x.shape), - "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + "x": mb.placeholder(shape=x.shape, dtype=builtin_x_dtype), + "indices": mb.placeholder( + shape=indices.shape, dtype=types.numpy_type_to_builtin_type(indices_dtype) + ), } input_values = {"x": x, "indices": indices} - expected_output_types = tuple(indices_shape[:]) + (types.fp32,) + expected_output_types = tuple(indices_shape[:]) + (builtin_x_dtype,) expected_output = np.take_along_axis(x, indices, axis=axis) run_compare_builder( @@ -604,15 +613,21 @@ def build(x, indices): @mark_api_breaking(breaking_opset_version=ct.target.iOS17) @pytest.mark.parametrize( - "compute_unit, backend, rank_axis", + "compute_unit, backend, rank_axis, x_dtype, indices_dtype", itertools.product( compute_units, backends, [(rank, axis) for rank in range(1, 5) for axis in range(-rank, rank)], + [np.float32, np.float16, np.int32], + [np.int32], ), ) - def test_builder_to_backend_programmatic(self, compute_unit, backend, rank_axis): - self._test_builder_to_backend_programmatic(compute_unit, backend, rank_axis, False) + def test_builder_to_backend_programmatic( + self, compute_unit, backend, rank_axis, x_dtype, indices_dtype + ): + self._test_builder_to_backend_programmatic( + compute_unit, backend, rank_axis, x_dtype, indices_dtype, False + ) @pytest.mark.parametrize( "backend, indices_val, validate_indices", diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_operation.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_operation.py index 53b719c0b..a9f43104c 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_operation.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_operation.py @@ -1006,6 +1006,17 @@ def prog(x, y): pad = mb.reshape(x=y, shape=[-1]) res = mb.pad(x=x, pad=pad) + @staticmethod + def test_error_out_with_invalid_padding_value(): + with pytest.raises( + ValueError, + match=r"pad must be non-negative integer, got -1022 at index 6", + ): + + @mb.program(input_specs=[mb.TensorSpec(shape=(1, 48, 1, 1024))]) + def prog(x): + y = mb.pad(x=x, pad=[0, 0, 0, 0, 0, 0, -1022, 0], mode="constant") + return y class TestRange1d: @pytest.mark.parametrize( diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_transformation.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_transformation.py index ab24a6eea..1a1d31fc3 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_transformation.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_transformation.py @@ -344,6 +344,15 @@ def build(x): backend=backend, ) + @staticmethod + def test_expand_dims_value_inference_is_inplace(): + @mb.program() + def prog(): + const = mb.const(val=[[2, 3], [4, 5]]) + x = mb.expand_dims(x=const, axes=(1, 2)) + x.val[0, 0, 0, 0] = 112 + 
assert const.val[0, 0] == 112 + return x class TestReshape: @pytest.mark.parametrize( @@ -491,6 +500,16 @@ def prog(x): assert res_sym_val[0][1] == shape.sym_val[1] return res + @staticmethod + def test_reshape_value_inference_is_inplace(): + @mb.program() + def prog(): + const = mb.const(val=[[2, 3], [4, 5]]) + x = mb.reshape(x=const, shape=(4, 1)) + x.val[0, 0] = 112 + assert const.val[0, 0] == 112 + return x + class TestReverse: @pytest.mark.parametrize( "compute_unit, backend", @@ -891,7 +910,7 @@ def test_builder_eval(self): mb.slice_by_index( x=x_val, begin=[1, 1, 1], - end=[2, 3, 4], + end=[2, 3, 3], stride=[1, 1, 2], begin_mask=[False, False, True], end_mask=[True, False, False], @@ -1239,6 +1258,16 @@ def test_builder_eval_rank_0(self): assert type(v.val) == np.float32 assert np.isclose(np.squeeze(x), v.val) + @staticmethod + def test_squeeze_value_inference_is_inplace(): + @mb.program() + def prog(): + const = mb.const(val=[[[2, 3], [4, 5]]]) + x = mb.squeeze(x=const, axes=(0,)) + x.val[0, 0] = 112 + assert const.val[0, 0, 0] == 112 + return x + class TestTranspose: @pytest.mark.parametrize( @@ -1341,6 +1370,15 @@ def build(x): backend=backend, ) + @staticmethod + def test_transpose_value_inference_is_inplace(): + @mb.program() + def prog(): + const = mb.const(val=[[2, 3], [4, 5]]) + x = mb.transpose(x=const, perm=(0, 1)) + x.val[0, 0] = 112 + assert const.val[0, 0] == 112 + return x class TestPixelShuffle: @pytest.mark.parametrize( diff --git a/coremltools/converters/mil/mil/ops/tests/iOS16/test_conv.py b/coremltools/converters/mil/mil/ops/tests/iOS16/test_conv.py index 5782f39c9..77f875cf8 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS16/test_conv.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS16/test_conv.py @@ -3,23 +3,18 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -import itertools import numpy as np import pytest -from coremltools.converters.mil import testing_reqs from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import types from coremltools.converters.mil.mil.ops.tests.iOS16 import backends from coremltools.converters.mil.testing_utils import get_op_types_in_program -compute_units = testing_reqs.compute_units - - class TestConvolution: - @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) - def test_type_inference_with_constexpr_ops(self, compute_unit, backend): + @pytest.mark.parametrize("backend", backends) + def test_type_inference_with_constexpr_ops(self, backend): # Test the type inference of the conv op doesn't error out for constexpr bias @mb.program( input_specs=[mb.TensorSpec(shape=(1, 3, 4, 4), dtype=types.fp32)], diff --git a/coremltools/converters/mil/mil/ops/tests/iOS16/test_scatter_gather.py b/coremltools/converters/mil/mil/ops/tests/iOS16/test_scatter_gather.py index 4e1918623..7d6fa1fbb 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS16/test_scatter_gather.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS16/test_scatter_gather.py @@ -11,6 +11,9 @@ import coremltools as ct from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.ops.tests.iOS14.test_scatter_gather import ( + TestGatherAlongAxis as _TestGatherAlongAxis_iOS14, +) from coremltools.converters.mil.mil.ops.tests.iOS16 import backends from 
coremltools.converters.mil.mil.ops.tests.testing_utils import ( mark_api_breaking, @@ -21,17 +24,24 @@ class TestGather: @pytest.mark.parametrize( - "compute_unit, backend", - itertools.product(compute_units, backends), + "compute_unit, backend, x_dtype, indices_dtype", + itertools.product( + compute_units, + backends, + [np.float32, np.float16, np.int32], + [np.int32, np.int16, np.uint16], + ), ) - def test_builder_to_backend_smoke_batch_dims(self, compute_unit, backend): - # TODO MAKE SURE RUN ON IOS17 - x = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=np.float32) - indices = np.array([[[1, 0], [0, 1]], [[1, 0], [0, 0]]], dtype=np.int32) + def test_builder_to_backend_smoke(self, compute_unit, backend, x_dtype, indices_dtype): + x = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=x_dtype) + indices = np.array([[[1, 0], [0, 1]], [[1, 0], [0, 0]]], dtype=indices_dtype) + builtin_x_dtype = types.numpy_type_to_builtin_type(x_dtype) input_placeholders = { - "x": mb.placeholder(shape=x.shape), - "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + "x": mb.placeholder(shape=x.shape, dtype=builtin_x_dtype), + "indices": mb.placeholder( + shape=indices.shape, dtype=types.numpy_type_to_builtin_type(indices_dtype) + ), } input_values = {"x": x, "indices": indices} @@ -46,11 +56,11 @@ def build(x, indices): ] expected_output_types = [ - (2, 2, 2, 2, 3, types.fp32), - (2, 2, 2, 3, types.fp32), - (2, 2, 2, 2, 2, types.fp32), - (2, 2, 2, 2, types.fp32), - (2, 2, 2, types.fp32), + (2, 2, 2, 2, 3, builtin_x_dtype), + (2, 2, 2, 3, builtin_x_dtype), + (2, 2, 2, 2, 2, builtin_x_dtype), + (2, 2, 2, 2, builtin_x_dtype), + (2, 2, 2, builtin_x_dtype), ] expected_outputs = [ @@ -65,14 +75,14 @@ def build(x, indices): [[[10, 11, 12], [7, 8, 9]], [[7, 8, 9], [7, 8, 9]]], ], ], - dtype=np.float32, + dtype=x_dtype, ), np.array( [ [[[4, 5, 6], [1, 2, 3]], [[1, 2, 3], [4, 5, 6]]], [[[10, 11, 12], [7, 8, 9]], [[7, 8, 9], [7, 8, 9]]], ], - dtype=np.float32, + dtype=x_dtype, ), np.array( [ @@ -82,13 +92,13 @@ def build(x, indices): [[[11, 10], [10, 11]], [[11, 10], [10, 10]]], ], ], - dtype=np.float32, + dtype=x_dtype, ), np.array( [[[[2, 1], [1, 2]], [[5, 4], [4, 5]]], [[[8, 7], [7, 7]], [[11, 10], [10, 10]]]], - dtype=np.float32, + dtype=x_dtype, ), - np.array([[[2, 1], [4, 5]], [[8, 7], [10, 10]]], dtype=np.float32), + np.array([[[2, 1], [4, 5]], [[8, 7], [10, 10]]], dtype=x_dtype), ] run_compare_builder( @@ -127,19 +137,57 @@ def prog(x): ) +class TestGatherAlongAxis(_TestGatherAlongAxis_iOS14): + @pytest.mark.parametrize( + "compute_unit, backend, x_dtype, indices_dtype", + itertools.product( + compute_units, + backends, + [np.float32, np.float16, np.int32], + [np.int32, np.int16, np.uint16], + ), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend, x_dtype, indices_dtype): + super().test_builder_to_backend_smoke(compute_unit, backend, x_dtype, indices_dtype) + + @pytest.mark.parametrize( + "compute_unit, backend, rank_axis, x_dtype, indices_dtype", + itertools.product( + compute_units, + backends, + [(rank, axis) for rank in range(1, 5) for axis in range(-rank, rank)], + [np.float32, np.float16, np.int32], + [np.int32, np.int16, np.uint16], + ), + ) + def test_builder_to_backend_programmatic( + self, compute_unit, backend, rank_axis, x_dtype, indices_dtype + ): + super()._test_builder_to_backend_programmatic( + compute_unit, backend, rank_axis, x_dtype, indices_dtype, True + ) + + class TestGatherNd: @pytest.mark.parametrize( - "compute_unit, 
backend", - itertools.product(compute_units, backends), + "compute_unit, backend, x_dtype, indices_dtype", + itertools.product( + compute_units, + backends, + [np.float32, np.float16, np.int32], + [np.int32, np.int16, np.uint16], + ), ) - def test_builder_to_backend_smoke_batch_dims(self, compute_unit, backend): - # TODO MAKE SURE RUN ON IOS17 + def test_builder_to_backend_smoke(self, compute_unit, backend, x_dtype, indices_dtype): x = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=np.float32) indices = np.array([[[1, 0], [0, 1]], [[1, 0], [0, 0]]], dtype=np.int32) + builtin_x_dtype = types.numpy_type_to_builtin_type(x_dtype) input_placeholders = { - "x": mb.placeholder(shape=x.shape), - "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + "x": mb.placeholder(shape=x.shape, dtype=builtin_x_dtype), + "indices": mb.placeholder( + shape=indices.shape, dtype=types.numpy_type_to_builtin_type(indices_dtype) + ), } input_values = {"x": x, "indices": indices} @@ -150,11 +198,11 @@ def build(x, indices): mb.gather_nd(x=x, indices=indices, batch_dims=1), ] - expected_output_types = [(2, 2, 3, types.fp32), (2, 2, types.fp32)] + expected_output_types = [(2, 2, 3, builtin_x_dtype), (2, 2, builtin_x_dtype)] expected_outputs = [ - np.array([[[7, 8, 9], [4, 5, 6]], [[7, 8, 9], [1, 2, 3]]], dtype=np.float32), - np.array([[4, 2], [10, 7]], dtype=np.float32), + np.array([[[7, 8, 9], [4, 5, 6]], [[7, 8, 9], [1, 2, 3]]], dtype=x_dtype), + np.array([[4, 2], [10, 7]], dtype=x_dtype), ] run_compare_builder( diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_linear.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_linear.py index a020003f6..ce014a3a1 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS17/test_linear.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_linear.py @@ -13,6 +13,7 @@ from coremltools.converters.mil.mil import types from coremltools.converters.mil.mil.ops.tests.iOS17 import backends from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder +from coremltools.converters.mil.mil.types import builtin_to_string, nptype_from_builtin from coremltools.converters.mil.mil.types.type_mapping import numpy_type_to_builtin_type from coremltools.converters.mil.testing_reqs import compute_units @@ -59,6 +60,29 @@ def build(x): backend=backend, ) + @pytest.mark.parametrize( + "compute_unit, backend, x_input_type, weight_input_type", + itertools.product( + compute_units, + backends, + [types.int32, types.fp16, types.fp32], + [types.int32, types.fp16, types.fp32], + ), + ) + def test_default_bias_type_ios17(self, compute_unit, backend, x_input_type, weight_input_type): + # Start from iOS17, x and weight can have different dtype. + # Test the default bias matches the dtype of weight. 
+ @mb.program( + input_specs=[mb.TensorSpec(shape=(1, 2), dtype=types.fp32)], + opset_version=backend.opset_version, + ) + def prog(x): + x = mb.cast(x=x, dtype=builtin_to_string(x_input_type)) + weight = np.random.rand(3, 2).astype(nptype_from_builtin(weight_input_type)) + res = mb.linear(x=x, weight=weight) + assert res.op.bias.val.dtype == nptype_from_builtin(weight_input_type) + return res + class TestMatMul: @pytest.mark.parametrize( diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_scatter_gather.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_scatter_gather.py index 7199c8a2c..6d5d0cdeb 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS17/test_scatter_gather.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_scatter_gather.py @@ -11,10 +11,16 @@ from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import types from coremltools.converters.mil.mil.ops.tests.iOS14.test_scatter_gather import ( - TestGatherAlongAxis as _TestGatherAlongAxis_iOS14, + TestGatherAlongAxis as _TestGatherAlongAxisIOS14, ) from coremltools.converters.mil.mil.ops.tests.iOS14.test_scatter_gather import ( - TestScatterAlongAxis as _TestScatterAlongAxis_iOS14, + TestScatterAlongAxis as _TestScatterAlongAxisIOS14, +) +from coremltools.converters.mil.mil.ops.tests.iOS16.test_scatter_gather import ( + TestGather as _TestGatherIOS16, +) +from coremltools.converters.mil.mil.ops.tests.iOS16.test_scatter_gather import ( + TestGatherNd as _TestGatherNdIOS16, ) from coremltools.converters.mil.mil.ops.tests.iOS17 import backends from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder @@ -107,7 +113,7 @@ class TestScatterAlongAxis: ), ) def test_builder_to_backend_programmatic(self, compute_unit, backend, rank_axis): - _TestScatterAlongAxis_iOS14._test_builder_to_backend_programmatic( + _TestScatterAlongAxisIOS14._test_builder_to_backend_programmatic( compute_unit, backend, rank_axis, force_non_negative_indices=True ) @@ -241,7 +247,19 @@ def build_dynamic(data, indices, updates): assert any([err in str(excinfo.value) for err in expected_error_msg]) -class TestGather: +class TestGather(_TestGatherIOS16): + @pytest.mark.parametrize( + "compute_unit, backend, x_dtype, indices_dtype", + itertools.product( + compute_units, + backends, + [np.float32, np.float16, np.int32, np.int16, np.uint16, np.int8, np.uint8], + [np.int32, np.int16, np.uint16, np.int8, np.uint8], + ), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend, x_dtype, indices_dtype): + super().test_builder_to_backend_smoke(compute_unit, backend, x_dtype, indices_dtype) + @pytest.mark.parametrize( "backend, indices_val, validate_indices", itertools.product(backends, [[-1, 0], [0, 3]], [True, False]), @@ -276,16 +294,20 @@ def prog(x): class TestGatherAlongAxis: @pytest.mark.parametrize( - "compute_unit, backend, rank_axis", + "compute_unit, backend, rank_axis, x_dtype, indices_dtype", itertools.product( compute_units, backends, - [(rank, axis) for rank in range(1, 5) for axis in range(-rank, rank)], + [(rank, axis) for rank in (3,) for axis in (-rank, 0, rank - 1)], + [np.float32, np.float16, np.int32, np.int16, np.uint16, np.int8, np.uint8], + [np.int32, np.int16, np.uint16, np.int8, np.uint8], ), ) - def test_builder_to_backend_programmatic(self, compute_unit, backend, rank_axis): - _TestGatherAlongAxis_iOS14._test_builder_to_backend_programmatic( - compute_unit, backend, rank_axis, True + def test_builder_to_backend_programmatic( + self, 
compute_unit, backend, rank_axis, x_dtype, indices_dtype + ): + _TestGatherAlongAxisIOS14._test_builder_to_backend_programmatic( + compute_unit, backend, rank_axis, x_dtype, indices_dtype, True ) @pytest.mark.parametrize( @@ -327,7 +349,20 @@ def prog(x): opset_version=backend.opset_version, )(prog) -class TestGatherNd: + +class TestGatherNd(_TestGatherNdIOS16): + @pytest.mark.parametrize( + "compute_unit, backend, x_dtype, indices_dtype", + itertools.product( + compute_units, + backends, + [np.float32, np.float16, np.int32, np.int16, np.uint16, np.int8, np.uint8], + [np.int32, np.int16, np.uint16, np.int8, np.uint8], + ), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend, x_dtype, indices_dtype): + super().test_builder_to_backend_smoke(compute_unit, backend, x_dtype, indices_dtype) + @pytest.mark.parametrize( "backend, indices_val, validate_indices", itertools.product( diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py b/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py index f26045a0d..3b8125c4f 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py @@ -21,6 +21,136 @@ from coremltools.converters.mil.mil.passes.pass_registry import register_pass +@register_pass(namespace="common") +class merge_tensorwise_affine_dequantize_with_consecutive_ops(AbstractGraphPass): + """ + This graph pass does const folding to a chain of supported ops starts with a + tensor-wise ``constexpr_affine_dequantize`` op. i.e., both ``scale`` and + ``zero_point`` are scalar (rank 0). + + For example: + Input graph: + data -> constexpr_affine_dequantize -> transpose -> expand_dims -> out + + Output graph: + new_data -> constexpr_affine_dequantize -> out + + where ``new_data`` is computed by ``data -> transpose -> expand_dims``. + + Note that, the graph pass only supports const folding of a single linked list pattern. 
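# A minimal numpy sanity check (illustrative only, names are local to this sketch)
# of the property the fold relies on: with a scalar (tensor-wise) scale and
# zero_point, affine-dequantizing then transposing gives the same result as
# transposing the quantized data first and dequantizing afterwards.
import numpy as np

quantized = np.random.randint(-128, 128, size=(2, 3, 4)).astype(np.int8)
scale, zero_point = np.float32(8.9), np.int8(34)
dequant_then_move = ((quantized.astype(np.float32) - zero_point) * scale).transpose(2, 0, 1)
move_then_dequant = (quantized.transpose(2, 0, 1).astype(np.float32) - zero_point) * scale
np.testing.assert_allclose(dequant_then_move, move_then_dequant)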
+ For example, the following pattern will not be changed: + + data ---> constexpr_affine_dequantize -> transpose -> out + | + --> constexpr_affine_dequantize -> reshape -> out_2 + """ + + SUPPORTED_OPS = [ + "transpose", + "reshape", + "expand_dims", + "squeeze", + ] + + def apply(self, prog): + for f in prog.functions.values(): + block_changed = True + while block_changed: + block_changed = self.merge_tensorwise_affine_dequantize_with_consecutive_ops_block( + f + ) + + @block_context_manager + def merge_tensorwise_affine_dequantize_with_consecutive_ops_block(self, block): + fusion_status = False + for op in list(block.operations): + for b in op.blocks: + block_changed = True + while block_changed: + block_changed = ( + self.merge_tensorwise_affine_dequantize_with_consecutive_ops_block(b) + ) + + if op.op_type != "constexpr_affine_dequantize": + continue + + fusion_status = self._try_to_transform(op, block) + if fusion_status: + return fusion_status + return fusion_status + + @staticmethod + def _apply_equivalent_transform(val, op): + if op.op_type not in merge_tensorwise_affine_dequantize_with_consecutive_ops.SUPPORTED_OPS: + raise ValueError(f"unsupported op_type {op.op_type}") + + if op.op_type == "transpose": + return np.transpose(val, axes=op.perm.val) + if op.op_type == "reshape": + return np.reshape(val, op.outputs[0].shape) + if op.op_type == "expand_dims": + return np.expand_dims(val, axis=op.axes.val.tolist()) + if op.op_type == "squeeze": + axes = op.axes + if axes is None or axes.val is None: + return np.squeeze(val) + return np.squeeze(val, axis=tuple(op.axes.val.tolist())) + + @staticmethod + def _try_to_transform(op, block): + # first check if it is tensorwise quantization + if op.scale.rank != 0 or op.zero_point.rank != 0: + return False + + # first check if quantized_data only feeds into a single op + if len(op.quantized_data.child_ops) != 1: + return False + + # traverse the graph to get a chain of applicable ops to fold + ops_to_fold = [] + cursor = op + while True: + prev_cursor = cursor + if cursor.outputs[0] in block.outputs: + break + for val in merge_tensorwise_affine_dequantize_with_consecutive_ops.SUPPORTED_OPS: + if _check_child_op_type(cursor, val): + ops_to_fold.append(cursor.outputs[0].child_ops[0]) + cursor = ops_to_fold[-1] + break + if prev_cursor == cursor: + break + + if len(ops_to_fold) == 0: + return False + + # do the same transformation on the source quantized data + cursor = op.quantized_data.val + for val in ops_to_fold: + cursor = ( + merge_tensorwise_affine_dequantize_with_consecutive_ops._apply_equivalent_transform( + cursor, val + ) + ) + + # after transformation, we create a new constexpr_affine_dequantize op and do the replacement + new_var = mb.constexpr_affine_dequantize( + quantized_data=cursor, + zero_point=op.zero_point, + scale=op.scale, + axis=op.axis, + name=ops_to_fold[-1].outputs[0].name, + before_op=ops_to_fold[-1], + ) + block.replace_uses_of_var_after_op( + anchor_op=ops_to_fold[-1], + old_var=ops_to_fold[-1].outputs[0], + new_var=new_var, + force_replace=True, + ) + block.remove_ops([op] + ops_to_fold) + return True + @register_pass(namespace="common") class int_op_canonicalization(AbstractGraphPass): """ diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_tensor_operation.py b/coremltools/converters/mil/mil/passes/defs/optimize_tensor_operation.py index 5733ab1b7..f0c0fd8ff 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_tensor_operation.py +++ 
b/coremltools/converters/mil/mil/passes/defs/optimize_tensor_operation.py @@ -19,6 +19,83 @@ from coremltools.converters.mil.mil.types.symbolic import any_symbolic +@register_pass(namespace="common") +class fuse_squeeze_expand_dims(AbstractGraphPass): + """ + Detect the pattern ``input-->squeeze-->expand_dims``, and fuse + them into an ``identity`` op if ``squeeze`` and ``expand_dims`` cancel out each other. + Note that, the ``identity`` can be further removed by ``noop_elimination``. + + .. code-block:: + + Given: + %x[3, 1, 4, 1] + %1[3, 4] = squeeze(%x, axes=[1, 3]) + %2[3, 1, 4, 1] = expand_dims(%1, axes=[1, 3]) + %3 = op(%2) + + Result: + %x[3, 1, 4, 1] + %2[3, 1, 4, 1] = identity(%x) + %3 = op(%2) + """ + + def apply(self, prog): + for f in prog.functions.values(): + block_changed = True + while block_changed: + block_changed = self.fuse_squeeze_expand_dims_block(f) + + @block_context_manager + def fuse_squeeze_expand_dims_block(self, block): + fusion_status = False + for op in list(block.operations): + for b in op.blocks: + block_changed = True + while block_changed: + block_changed = self.fuse_squeeze_expand_dims_block(b) + + if len(op.blocks) > 0: + continue + + squeeze_op = self._match_pattern(op) + if squeeze_op is not None: + fusion_status = self._try_to_transform(squeeze_op, block) + # has to break as the downstream iterator is affected. + if fusion_status: + return fusion_status + return fusion_status + + @staticmethod + def _match_pattern(op): + if op.op_type != "squeeze": + return None + if not _check_child_op_type(op, "expand_dims"): + return None + return op + + @staticmethod + def _try_to_transform(op, block): + expand_dims_op = op.outputs[0].child_ops[0] + x = op.x + out_var = expand_dims_op.outputs[0] + if x.shape != out_var.shape: + return False + if op.outputs[0] in block.outputs: + return False + + new_var = mb.identity(x=x, before_op=op) + if op.enclosing_block.try_replace_uses_of_var_after_op( + anchor_op=expand_dims_op, + old_var=out_var, + new_var=new_var, + ): + # Remove all the ops at once + block.remove_ops([op, expand_dims_op]) + return True + return False + + @register_pass(namespace="common") class expand_high_rank_reshape_and_transpose(AbstractGraphPass): """ diff --git a/coremltools/converters/mil/mil/passes/defs/quantization.py b/coremltools/converters/mil/mil/passes/defs/quantization.py index 4ddea93a6..fab3e1656 100644 --- a/coremltools/converters/mil/mil/passes/defs/quantization.py +++ b/coremltools/converters/mil/mil/passes/defs/quantization.py @@ -3,6 +3,7 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +from abc import abstractmethod from enum import Enum as _Enum from typing import Set, Text @@ -11,10 +12,13 @@ from coremltools.converters.mil._deployment_compatibility import AvailableTarget from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Operation, types +from coremltools.converters.mil.mil.block import is_current_opset_version_compatible_with +from coremltools.converters.mil.mil.ops.registry import SSAOpRegistry from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass from coremltools.converters.mil.mil.passes.helper import block_context_manager from coremltools.converters.mil.mil.passes.pass_registry import register_pass from coremltools.converters.mil.mil.program import Program +from coremltools.converters.mil.mil.types.symbolic import is_symbolic class 
ComputePrecision(_Enum): @@ -30,7 +34,6 @@ class AbstractQuantizationPass(AbstractGraphPass): - is_valid_op(op) - transform_op(op) """ - type_eps = {} type_min = {} type_negmin = {} @@ -119,7 +122,127 @@ def __str__(self): return type(self).__name__ -class FP16ComputePrecision(AbstractQuantizationPass): +class CastTypeQuantization(AbstractQuantizationPass): + """ + Base class for all type casting related quantization, such as fp32->fp16, int32->int16, etc. + + For each valid op, if the "op_selector" return True: + - For each input with dtype `origin_dtype`, inject a "cast" op to change it to `target_dtype`. + - For each output with dtype `target_dtype`, inject a "cast" op to change it back to `origin_dtype`. + All child classes need to specify `origin_dtype` and `target_dtype`. + """ + + def __init__(self, op_selector=None): + super().__init__(op_selector=op_selector) + + # Var that feeds into multiple ops will be cast once and cached into this dict + # For reference: Checkout test_single_input_to_multiple_operations in `TestFP16CastTransform`. + self.cache_vars = {} + + @property + @abstractmethod + def origin_dtype(self) -> str: + """Original dtype that need to be cast, such as fp32.""" + raise NotImplementedError("origin_dtype must be specified in subclass.") + + @property + @abstractmethod + def target_dtype(self) -> str: + """Target dtype, such as fp16.""" + raise NotImplementedError("target_dtype must be specified in subclass.") + + def should_cast_parameter(self, op: Operation, param_name: str) -> bool: + """ + Determines if a param of an op should be cast to target_dtype. + + There are two cases that an op shouldn't be cast: + 1. The op's parameter doesn't support target_dtype. + 2. The cast op itself doesn't support target_dtype + """ + type_domain = getattr(op.input_spec.input_types[param_name], "type_domain", None) + if type_domain and types.string_to_builtin(self.target_dtype) not in type_domain: + return False + if self.target_dtype not in SSAOpRegistry._get_core_op_cls("cast").supported_dtypes(): + return False + + return True + + def transform_op(self, op) -> None: + """Transform the input(s)/output(s) dtypes of the op.""" + block = op.enclosing_block + casted_inputs = {} + inputs_modified = False + + for param, inputs in op.inputs.items(): + if not self.should_cast_parameter(op, param): + continue + + is_list_input = isinstance(inputs, (list, tuple)) + if not is_list_input: + inputs = [inputs] + + casted_inputs[param] = list(inputs[:]) + for i, var in enumerate(inputs): + if not var.is_tensor_or_scalar_of(dtype=self.origin_dtype): + continue + + inputs_modified = True + casted_var_name = f"{var.name}_to_{self.target_dtype}" + if ( + len(var._child_ops) > 1 + and casted_var_name in self.cache_vars + and (block.is_var_visible_in_block(self.cache_vars[casted_var_name])) + ): + casted_inputs[param][i] = self.cache_vars[casted_var_name] + else: + x = mb.cast(x=var, dtype=self.target_dtype, name=casted_var_name, before_op=op) + if self.target_dtype == "fp16": + self._check_underflow_to_zero(x, var) + + casted_inputs[param][i] = x + if len(var._child_ops) > 1: + self.cache_vars[casted_var_name] = casted_inputs[param][i] + + if not is_list_input: + casted_inputs[param] = casted_inputs[param][0] + + if inputs_modified: + casted_inputs.update({k: v for k, v in op.inputs.items() if k not in casted_inputs}) + casted_inputs["name"] = f"{op.name}_cast_{self.target_dtype}" + casted_inputs["before_op"] = op + quant_output = getattr(mb, op.op_type)(**casted_inputs) + + if not 
isinstance(quant_output, (list, tuple)): + quant_output = [quant_output] + + for old_output_var, new_output_var in zip(op.outputs, quant_output): + if old_output_var.is_tensor_or_scalar_of(dtype=self.origin_dtype) and ( + not new_output_var.is_tensor_or_scalar_of(dtype=self.origin_dtype) + ): + x = mb.cast( + x=new_output_var, + dtype=self.origin_dtype, + name=f"{new_output_var.name}_to_{self.origin_dtype}", + before_op=op, + ) + op.enclosing_block.replace_uses_of_var_after_op( + anchor_op=op, + old_var=old_output_var, + new_var=x, + force_replace=True, + ) + else: + op.enclosing_block.replace_uses_of_var_after_op( + anchor_op=op, + old_var=old_output_var, + new_var=new_output_var, + force_replace=True, + ) + + block.remove_ops([op]) + + +class FP16ComputePrecision(CastTypeQuantization): """ This transform does the following, for each valid op and if the "op_selector" return True: - For each input of dtype float32, inject a "cast" op to change it to float16 dtype @@ -137,21 +260,43 @@ class FP16ComputePrecision(AbstractQuantizationPass): } _ELEMENTWISE_UNARY_EPSILON_OPS: Set[str] = {"inverse", "log", "rsqrt"} + # Unsupported op for fp16 casting + _UNSUPPORTED_FP16_OPS: Set[str] = { + "cast", + "while_loop", + "cond", + # TODO: Remove after supporting FP16 dynamic quantize transformation for list ops (rdar://74458192) + "make_list", + "list_gather", + "list_scatter", + "list_read", + "list_write", + "list_length", + } + def __init__(self, op_selector=None): super(FP16ComputePrecision, self).__init__(op_selector=op_selector) - self.target_dtype = "fp16" - # Var that feeds into multiple ops will be casted once and cached into this dict - # For reference: Checkout test_single_input_to_multiple_operations in `TestFP16CastTransform`. - self.cache_vars = {} + @property + def origin_dtype(self) -> str: + return "fp32" - def fp16_overflow(self, op: Operation) -> bool: - # This overflow check consists of two parts: - # 1. For valid fp32 numbers (abs < 1e38), we want their exact values, - # so we make sure they are within fp16 range [-65504, 65504] - # 2. For inifinities (abs >= 1e38), their exact values does not matter, - # so we can always downcast them to fp16 inf. For example, in attention mask - # we just want -inf to make the masked entries have 0 probability after softmax + @property + def target_dtype(self) -> str: + return "fp16" + + @staticmethod + def fp16_overflow(op: Operation) -> bool: + """ + Determines if any of the op's input will overflow when represented by FP16. + + This overflow check consists of two parts: + 1. For valid fp32 numbers (abs < 1e38), we want their exact values, + so we make sure they are within fp16 range [-65504, 65504] + 2. For inifinities (abs >= 1e38), their exact values does not matter, + so we can always downcast them to fp16 inf. 
For example, in attention mask + we just want -inf to make the masked entries have 0 probability after softmax + """ for _, inputs in op.inputs.items(): is_list_input = isinstance(inputs, (list, tuple)) if not is_list_input: @@ -170,18 +315,7 @@ def fp16_overflow(self, op: Operation) -> bool: def is_valid_op(self, op: Operation) -> bool: """Determines if op is valid for fp16 casting.""" - if op.op_type in ["cast", "while_loop", "cond"]: - return False - - # TODO: Remove after supporting FP16 dynamic quantize transformation for list ops (rdar://74458192) - if op.op_type in [ - "make_list", - "list_gather", - "list_scatter", - "list_read", - "list_write", - "list_length", - ]: + if op.op_type in self._UNSUPPORTED_FP16_OPS: return False if self.fp16_overflow(op): @@ -190,13 +324,11 @@ def is_valid_op(self, op: Operation) -> bool: return True def should_cast_parameter(self, op: Operation, param_name: str) -> bool: - """Determines if a param of an op should be casted to fp16.""" - # Make sure the param is valid for fp16 when type domain is specified. - type_domain = getattr(op.input_spec.input_types[param_name], "type_domain", None) - if type_domain and types.fp16 not in type_domain: + """Determines if a param of an op should be cast to fp16.""" + if not super().should_cast_parameter(op, param_name): return False - if op.opset_version >= AvailableTarget.iOS17: + if is_current_opset_version_compatible_with(AvailableTarget.iOS17): # In IOS17+ activation ops with alpha/beta support mixed precision, and we don't want to # cast alpha/beta to fp16 for better numerical accuracy. if op.op_type in self._ACTIVATION_ALPHA_OPS and param_name == "alpha": @@ -239,80 +371,6 @@ def _check_underflow_to_zero(self, new_var, var): else: new_var._sym_val.val = new_val.reshape(new_var.val.shape) - def transform_op(self, op): - block = op.enclosing_block - casted_inputs = {} - inputs_modified = False - - for param, inputs in op.inputs.items(): - # First loop, iterates over all the input parameters of an operation. - if not self.should_cast_parameter(op, param): - continue - - is_list_input = isinstance(inputs, (list, tuple)) - if not is_list_input: - inputs = [inputs] - - casted_inputs[param] = list(inputs[:]) - for i, var in enumerate(inputs): - # Second loop, iterates over all the vars of a python list corresponding to an input parameter. 
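# Standalone numpy sketch (the helper name is hypothetical) of the fp16 overflow
# rule documented in fp16_overflow above: finite fp32 values outside
# [-65504, 65504] would lose their exact value in fp16, while near-infinite
# values (abs >= 1e38) are allowed to collapse to fp16 inf.
import numpy as np

def _would_overflow_in_fp16(values: np.ndarray) -> bool:
    finite = np.abs(values) < 1e38
    return bool(np.any(np.abs(values[finite]) > 65504))

assert _would_overflow_in_fp16(np.array([7.0e4], dtype=np.float32))
assert not _would_overflow_in_fp16(np.array([123.0, -65504.0], dtype=np.float32))
assert not _would_overflow_in_fp16(np.array([np.inf], dtype=np.float32))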
- if not var.is_tensor_or_scalar_of(dtype="fp32"): - continue - - inputs_modified = True - casted_var_name = var.name + "_to_fp16" - if ( - len(var._child_ops) > 1 - and casted_var_name in self.cache_vars - and (block.is_var_visible_in_block(self.cache_vars[casted_var_name])) - ): - casted_inputs[param][i] = self.cache_vars[casted_var_name] - else: - x = mb.cast(x=var, dtype="fp16", name=casted_var_name, before_op=op) - self._check_underflow_to_zero(x, var) - - casted_inputs[param][i] = x - if len(var._child_ops) > 1: - self.cache_vars[casted_var_name] = casted_inputs[param][i] - - if not is_list_input: - casted_inputs[param] = casted_inputs[param][0] - - if inputs_modified: - casted_inputs.update({k: v for k, v in op.inputs.items() if k not in casted_inputs}) - casted_inputs["name"] = op.name + "_cast" - casted_inputs["before_op"] = op - quant_output = getattr(mb, op.op_type)(**casted_inputs) - - if not isinstance(quant_output, (list, tuple)): - quant_output = [quant_output] - - for old_output_var, new_output_var in zip(op.outputs, quant_output): - if old_output_var.is_tensor_or_scalar_of(dtype="fp32") and ( - not new_output_var.is_tensor_or_scalar_of(dtype="fp32") - ): - x = mb.cast( - x=new_output_var, - dtype="fp32", - name=new_output_var.name + "_to_fp32", - before_op=op, - ) - op.enclosing_block.replace_uses_of_var_after_op( - anchor_op=op, - old_var=old_output_var, - new_var=x, - force_replace=True, - ) - else: - op.enclosing_block.replace_uses_of_var_after_op( - anchor_op=op, - old_var=old_output_var, - new_var=new_output_var, - force_replace=True, - ) - - block.remove_ops([op]) - @register_pass(namespace="common") class add_fp16_cast(FP16ComputePrecision): @@ -338,3 +396,61 @@ def skip_ops_by_type(self): @skip_ops_by_type.setter def skip_ops_by_type(self, criteria: Text): self._skip_ops_by_type = set(criteria.split(",")) + + +@register_pass(namespace="common") +class add_int16_cast(CastTypeQuantization): + """ + This transform does the following, for each op that supports int16: + - For each input of dtype int32 which actually supports int16, inject a "cast" op to change it + to int16 dtype. + - For each output of dtype int16, inject a "cast" op to change it back to int32. + It's mainly for int16 op ANE residency. + """ + # Ops that prefer int16 params. + _PREFER_INT16_OPS: Set[str] = {"gather", "gather_along_axis", "gather_nd"} + + def __init__(self, op_selector=None): + super().__init__(op_selector=op_selector) + + @property + def origin_dtype(self) -> str: + return "int32" + + @property + def target_dtype(self) -> str: + return "int16" + + @staticmethod + def int16_overflow(op: Operation) -> bool: + """ + Determines if any of the op's input will overflow when represented by int16. Constants with + values more than np.iinfo(np.int16).max or less than np.iinfo(np.int16).min overflows in int16. + """ + _INT16_MAX = np.iinfo(np.int16).max + _INT16_MIN = np.iinfo(np.int16).min + for _, inputs in op.inputs.items(): + is_list_input = isinstance(inputs, (list, tuple)) + if not is_list_input: + inputs = [inputs] + for var in inputs: + if var.val is not None and var.is_tensor_or_scalar_of(dtype="int32"): + if np.any(var.val > _INT16_MAX) or np.any(var.val < _INT16_MIN): + return True + + # In `gather` and `gather_along_axis`, if the dim size of x is larger than int16 upperbound, + # the dynamic indices could overflow. 
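# Standalone sketch (helper name is hypothetical) of the int16 eligibility rules
# above: a constant int32 tensor must fit entirely inside the int16 range, and a
# gather over a dimension larger than the int16 maximum keeps int32 indices.
import numpy as np

INT16_MIN, INT16_MAX = np.iinfo(np.int16).min, np.iinfo(np.int16).max  # -32768, 32767

def _fits_int16(values: np.ndarray) -> bool:
    return bool(np.all((values >= INT16_MIN) & (values <= INT16_MAX)))

assert _fits_int16(np.array([0, 100, -32768], dtype=np.int32))
assert not _fits_int16(np.array([40000], dtype=np.int32))
assert 40000 > INT16_MAX  # a dim size this large disqualifies dynamic indices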
+ if ( + op.op_type in {"gather", "gather_along_axis"} + and op.indices.val is None + and op.x.shape is not None + ): + dim_size = op.x.shape[op.axis.val] + if not is_symbolic(dim_size) and dim_size > _INT16_MAX: + return True + + return False + + def is_valid_op(self, op: Operation) -> bool: + """Determines if op is valid for int16 casting.""" + return op.op_type in self._PREFER_INT16_OPS and not self.int16_overflow(op) diff --git a/coremltools/converters/mil/mil/passes/pass_pipeline.py b/coremltools/converters/mil/mil/passes/pass_pipeline.py index 085269b14..d67e64d8f 100644 --- a/coremltools/converters/mil/mil/passes/pass_pipeline.py +++ b/coremltools/converters/mil/mil/passes/pass_pipeline.py @@ -50,6 +50,7 @@ "common::fuse_gelu_exact", "common::fuse_leaky_relu", "common::rank0_expand_dims_swap", + "common::fuse_squeeze_expand_dims", "common::compose_conv1d", # compose conv1d before any other conv passes "common::use_reflection_padding", "common::merge_consecutive_paddings", @@ -91,6 +92,7 @@ # which detects patterns that involve redundant ops ("sub") etc. "common::remove_redundant_ops", "common::add_fp16_cast", # Will be removed if compute precision is not FP16. + "common::add_int16_cast", # Will be removed if compute precision is not FP16. "common::dead_code_elimination", # always end with dce ] @@ -98,8 +100,11 @@ "common::dead_code_elimination", "common::const_elimination", "common::cast_optimization", + "common::dead_code_elimination", # must follow cast_optimization "common::const_elimination", "common::const_deduplication", # after all consts have been settled + "common::dead_code_elimination", # come before merge_tensorwise_affine_dequantize_with_consecutive_ops + "common::merge_tensorwise_affine_dequantize_with_consecutive_ops", # after const_deduplication and dead_code_elimination "common::loop_invariant_elimination", "common::noop_elimination", "common::dedup_op_and_var_names", @@ -371,6 +376,11 @@ def get_pipeline(cls, pipeline_name: Text) -> PassPipeline: ) return PassPipeline(cls._PIPELINE_NAME_TO_PASSES[pipeline_name], pipeline_name) + @classmethod + def list_available_pipelines(cls) -> List[str]: + """List all available pipelines.""" + return list(cls._PIPELINE_NAME_TO_PASSES.keys()) + """ ======================================= Pre-defined PassPipeline configurations diff --git a/coremltools/converters/mil/mil/passes/tests/test_pass_pipeline.py b/coremltools/converters/mil/mil/passes/tests/test_pass_pipeline.py index d48103a55..58687cea8 100644 --- a/coremltools/converters/mil/mil/passes/tests/test_pass_pipeline.py +++ b/coremltools/converters/mil/mil/passes/tests/test_pass_pipeline.py @@ -111,3 +111,9 @@ def test_get_invalid_pipeline(self): match="There is no pipeline for `invalid`.", ): PassPipeline.get_pipeline("invalid") + + def test_list_available_pipelines(self): + available_pipelines = PassPipeline.list_available_pipelines() + assert len(available_pipelines) == 12 + assert "default" in available_pipelines + assert "default_palettization" in available_pipelines diff --git a/coremltools/converters/mil/mil/passes/tests/test_passes.py b/coremltools/converters/mil/mil/passes/tests/test_passes.py index f1aa8598d..406507185 100644 --- a/coremltools/converters/mil/mil/passes/tests/test_passes.py +++ b/coremltools/converters/mil/mil/passes/tests/test_passes.py @@ -248,6 +248,83 @@ def _false_fn(): assert_op_count_match(prog, expect=6, op="const") +class TestFuseSqueezeExpandDims: + @pytest.mark.parametrize( + "rank", + [1, 5], + ) + def 
test_fuse_squeeze_expand_dims_basic(self, rank): + """ + Given: + %1 = squeeze(%x) + %2 = expand_dims(%1) + %3 = relu(%2) + + Result: + %3 = relu(%x) + """ + if rank == 1: + input_shape = (1,) + axes = (0,) + else: + assert rank == 5 + input_shape = (3, 1, 4, 1, 1) + axes = (1, 3, 4) + + @mb.program(input_specs=[mb.TensorSpec(shape=input_shape)]) + def prog(x): + x = mb.squeeze(x=x, axes=axes) + x = mb.expand_dims(x=x, axes=axes) + return mb.relu(x=x) + + # fuse_squeeze_expand_dims fused squeeze + expand_dims into identity + apply_pass_and_basic_check(prog, "common::fuse_squeeze_expand_dims") + assert get_op_types_in_program(prog) == ["identity", "relu"] + + # noop_elimination can further remove the identity op + apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prog) == ["relu"] + + def test_fuse_squeeze_expand_dims_negative(self): + """ + If squeeze and expand_dims cannot cancel each other, + the graph pass does nothing + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(3, 1, 4, 1, 1))]) + def prog(x): + x = mb.squeeze(x=x, axes=(1, 2)) + x = mb.expand_dims(x=x, axes=(1, 3)) + return mb.relu(x=x) + + apply_pass_and_basic_check(prog, "common::fuse_squeeze_expand_dims") + assert get_op_types_in_program(prog) == ["squeeze", "expand_dims", "relu"] + + def test_fuse_squeeze_expand_dims_connected_output(self): + """ + If squeeze is connected to block output, it cannot be removed. + However, the expand_dims can be a block output. + """ + # squeeze connected to output. Nothing happens. + @mb.program(input_specs=[mb.TensorSpec(shape=(1,))]) + def prog(x): + squeeze = mb.squeeze(x=x, axes=(0,)) + expand_dims = mb.expand_dims(x=squeeze, axes=(0,)) + return mb.relu(x=expand_dims), squeeze + + apply_pass_and_basic_check(prog, "common::fuse_squeeze_expand_dims") + assert get_op_types_in_program(prog) == ["squeeze", "expand_dims", "relu"] + + # expand_dims connected to output. Still good to fuse. 
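# A plain numpy illustration (separate from these tests) of the cancellation
# property the pass relies on: squeezing size-1 axes and re-expanding the same
# axes restores the original tensor exactly.
import numpy as np

x = np.random.rand(3, 1, 4, 1, 1).astype(np.float32)
restored = np.expand_dims(np.squeeze(x, axis=(1, 3, 4)), axis=(1, 3, 4))
assert restored.shape == x.shape
np.testing.assert_array_equal(restored, x)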
+ @mb.program(input_specs=[mb.TensorSpec(shape=(1,))]) + def prog(x): + squeeze = mb.squeeze(x=x, axes=(0,)) + expand_dims = mb.expand_dims(x=squeeze, axes=(0,)) + return mb.relu(x=expand_dims), expand_dims + + apply_pass_and_basic_check(prog, "common::fuse_squeeze_expand_dims") + assert get_op_types_in_program(prog) == ["identity", "relu"] + class TestConstElimination: def test_const_elimination(self): @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) @@ -3261,7 +3338,7 @@ def prog(x): return x prev_prog, _, block = apply_pass_and_basic_check(prog, "common::expand_high_rank_reshape_and_transpose") - prog._check_invalid_program() + prog._check_early_error_out_for_invalid_program() assert get_op_types_in_program(prog) == ["reshape", "transpose", "reshape"] TestExpandHighRankReshapeAndTranspose._test_numerical(prev_prog, input_shape, reshape_shape, perm, output_shape) @@ -3279,7 +3356,7 @@ def prog(x): return x prev_prog, _, block = apply_pass_and_basic_check(prog, "common::expand_high_rank_reshape_and_transpose") - prog._check_invalid_program() + prog._check_early_error_out_for_invalid_program() assert get_op_types_in_program(prog) == ["reshape", "transpose", "reshape"] TestExpandHighRankReshapeAndTranspose._test_numerical(prev_prog, input_shape, reshape_shape, perm, output_shape) @@ -3298,7 +3375,7 @@ def prog(x): prev_prog, _, block = apply_pass_and_basic_check(prog, "common::expand_high_rank_reshape_and_transpose") - prog._check_invalid_program() + prog._check_early_error_out_for_invalid_program() assert get_op_types_in_program(prog) == ["reshape", "transpose"] * 16 + ["reshape"] TestExpandHighRankReshapeAndTranspose._test_numerical(prev_prog, input_shape, reshape_shape, perm, output_shape) @@ -3318,7 +3395,7 @@ def prog(x): prev_prog, _, block = apply_pass_and_basic_check(prog, "common::expand_high_rank_reshape_and_transpose") with pytest.raises(ValueError, match="Core ML only supports tensors with rank <= 5"): - prog._check_invalid_program() + prog._check_early_error_out_for_invalid_program() class TestMergeConsecutiveRelus: diff --git a/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py b/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py index 5d0c43333..b1cbcf52f 100644 --- a/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py +++ b/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py @@ -9,10 +9,11 @@ import numpy as np import parameterized import pytest +from mock import patch import coremltools as ct import coremltools.converters.mil.mil.types as types -from coremltools._deps import _IS_MACOS +from coremltools._deps import _HAS_TORCH, _IS_MACOS, MSG_TORCH_NOT_FOUND from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil.passes.defs import quantization from coremltools.converters.mil.mil.types import numpy_type_to_builtin_type @@ -22,9 +23,282 @@ get_op_types_in_program, ) +if _HAS_TORCH: + import torch + import torch.nn as nn + np.random.seed(1818) +class TestTensorwiseAffineDequantizeConstElimination: + def test_eliminate_transpose(self): + """ + Input graph: + data -> constexpr_affine_dequantize -> transpose + + Output graph: + new_data -> constexpr_affine_dequantize + + where new_data is the value after applying transpose to data + """ + quantized_data = np.random.randint(0, 256, (1, 2, 3, 4)).astype(np.int8) + + @mb.program(input_specs=[], opset_version=ct.target.iOS16) + def prog(): + res = mb.constexpr_affine_dequantize( + quantized_data=quantized_data, 
+ axis=0, + scale=8.9, + zero_point=np.int8(34), + ) + return mb.transpose(x=res, perm=(2, 0, 1, 3)) + + apply_pass_and_basic_check( + prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" + ) + assert get_op_types_in_program(prog) == ["constexpr_affine_dequantize"] + + new_op = prog.find_ops(op_type="constexpr_affine_dequantize", exactly_one=True)[0] + expected_quantized_data = np.transpose(quantized_data, (2, 0, 1, 3)) + np.testing.assert_array_equal(new_op.quantized_data.val, expected_quantized_data) + + def test_eliminate_reshape(self): + """ + Input graph: + data -> constexpr_affine_dequantize -> reshape + + Output graph: + new_data -> constexpr_affine_dequantize + + where new_data is the value after applying reshape to data + """ + quantized_data = np.random.randint(0, 256, (1, 2, 3, 4)).astype(np.int8) + + @mb.program(input_specs=[], opset_version=ct.target.iOS16) + def prog(): + res = mb.constexpr_affine_dequantize( + quantized_data=quantized_data, + axis=0, + scale=8.9, + zero_point=np.int8(34), + ) + return mb.reshape(x=res, shape=(3, -1)) + + apply_pass_and_basic_check( + prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" + ) + assert get_op_types_in_program(prog) == ["constexpr_affine_dequantize"] + + new_op = prog.find_ops(op_type="constexpr_affine_dequantize", exactly_one=True)[0] + expected_quantized_data = np.reshape(quantized_data, (3, 8)) + np.testing.assert_array_equal(new_op.quantized_data.val, expected_quantized_data) + + def test_eliminate_expand_dims(self): + """ + Input graph: + data -> constexpr_affine_dequantize -> expand_dims + + Output graph: + new_data -> constexpr_affine_dequantize + + where new_data is the value after applying expand_dims to data + """ + quantized_data = np.random.randint(0, 256, (2, 3, 4)).astype(np.int8) + + @mb.program(input_specs=[], opset_version=ct.target.iOS16) + def prog(): + res = mb.constexpr_affine_dequantize( + quantized_data=quantized_data, + axis=0, + scale=8.9, + zero_point=np.int8(34), + ) + return mb.expand_dims(x=res, axes=(0, 2, 4)) + + apply_pass_and_basic_check( + prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" + ) + assert get_op_types_in_program(prog) == ["constexpr_affine_dequantize"] + + new_op = prog.find_ops(op_type="constexpr_affine_dequantize", exactly_one=True)[0] + expected_quantized_data = np.expand_dims(quantized_data, axis=(0, 2, 4)) + np.testing.assert_array_equal(new_op.quantized_data.val, expected_quantized_data) + + @pytest.mark.parametrize("axis", [(0, 3), None]) + def test_eliminate_squeeze(self, axis): + """ + Input graph: + data -> constexpr_affine_dequantize -> squeeze + + Output graph: + new_data -> constexpr_affine_dequantize + + where new_data is the value after applying squeeze to data + """ + quantized_data = np.random.randint(0, 256, (1, 2, 3, 1, 4)).astype(np.int8) + + @mb.program(input_specs=[], opset_version=ct.target.iOS16) + def prog(): + res = mb.constexpr_affine_dequantize( + quantized_data=quantized_data, + axis=0, + scale=8.9, + zero_point=np.int8(34), + ) + return mb.squeeze(x=res, axes=axis) + + apply_pass_and_basic_check( + prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" + ) + assert get_op_types_in_program(prog) == ["constexpr_affine_dequantize"] + + new_op = prog.find_ops(op_type="constexpr_affine_dequantize", exactly_one=True)[0] + expected_quantized_data = np.squeeze(quantized_data, axis=axis) + np.testing.assert_array_equal(new_op.quantized_data.val, expected_quantized_data) + + def 
test_eliminate_multiple_ops(self): + """ + Input graph: + data -> constexpr_affine_dequantize -> transpose -> + reshape -> expand_dims -> squeeze + + Output graph: + new_data -> constexpr_affine_dequantize + + where new_data is the value after applying the same chain of transformations to data + """ + quantized_data = np.random.randint(0, 256, (1, 2, 3, 4)).astype(np.int8) + + @mb.program(input_specs=[], opset_version=ct.target.iOS16) + def prog(): + res = mb.constexpr_affine_dequantize( + quantized_data=quantized_data, + axis=0, + scale=8.9, + zero_point=np.int8(34), + ) + res = mb.transpose(x=res, perm=(1, 0, 3, 2)) + res = mb.reshape(x=res, shape=(8, 3)) + res = mb.expand_dims(x=res, axes=(0, 2, 4)) + return mb.squeeze(x=res, axes=(2,)) + + apply_pass_and_basic_check( + prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" + ) + assert get_op_types_in_program(prog) == ["constexpr_affine_dequantize"] + + new_op = prog.find_ops(op_type="constexpr_affine_dequantize", exactly_one=True)[0] + + expected_quantized_data = np.transpose(quantized_data, (1, 0, 3, 2)) + expected_quantized_data = np.reshape(expected_quantized_data, (8, 3)) + expected_quantized_data = np.expand_dims(expected_quantized_data, (0, 2, 4)) + expected_quantized_data = np.squeeze(expected_quantized_data, (2,)) + + np.testing.assert_array_equal(new_op.quantized_data.val, expected_quantized_data) + + def test_negative_channel_wise_pattern(self): + """ + If ``constexpr_affine_dequantize`` is not tensor-wise, + the graph is not changed. + """ + quantized_data = np.random.randint(0, 256, (2, 3, 4)).astype(np.int8) + + @mb.program(input_specs=[], opset_version=ct.target.iOS16) + def prog(): + x = mb.constexpr_affine_dequantize( + quantized_data=quantized_data, + axis=0, + scale=[8.9, 6.5], + zero_point=np.int8(34), + ) + y = mb.constexpr_affine_dequantize( + quantized_data=quantized_data, + axis=0, + scale=8.9, + zero_point=np.int8([34, 56]), + ) + return mb.transpose(x=x, perm=(1, 0, 2)), mb.transpose(x=y, perm=(1, 0, 2)) + + apply_pass_and_basic_check( + prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" + ) + assert get_op_types_in_program(prog) == [ + "constexpr_affine_dequantize", + "constexpr_affine_dequantize", + "transpose", + "transpose", + ] + + def test_negative_non_linked_list_pattern(self): + """ + If ``quantized_data`` feeds into multiple ``constexpr_affine_dequantize`` ops, + the graph will not be changed. 
+ """ + quantized_data = np.random.randint(0, 256, (2, 3, 4)).astype(np.int8) + + @mb.program(input_specs=[], opset_version=ct.target.iOS16) + def prog(): + data = mb.const(val=quantized_data) + x = mb.constexpr_affine_dequantize( + quantized_data=data, + axis=0, + scale=8.9, + zero_point=np.int8(34), + ) + y = mb.constexpr_affine_dequantize( + quantized_data=data, + axis=0, + scale=8.1, + zero_point=np.int8(56), + ) + return mb.transpose(x=x, perm=(1, 0, 2)), mb.reshape(x=y, shape=(24,)) + + apply_pass_and_basic_check( + prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" + ) + assert get_op_types_in_program(prog) == [ + "constexpr_affine_dequantize", + "constexpr_affine_dequantize", + "transpose", + "reshape", + ] + + def test_eliminate_connected_outputs(self): + """ + The optimization stops when the node is a block output + """ + quantized_data = np.random.randint(0, 256, (2, 3, 4)).astype(np.int8) + + @mb.program(input_specs=[], opset_version=ct.target.iOS16) + def prog(): + x = mb.constexpr_affine_dequantize( + quantized_data=quantized_data, + axis=0, + scale=8.9, + zero_point=np.int8(34), + ) + x = mb.transpose(x=x, perm=(1, 0, 2)) + x = mb.reshape(x=x, shape=(2, 2, 3, 2)) + y = mb.transpose(x=x, perm=(0, 3, 2, 1)) + return x, y + + apply_pass_and_basic_check( + prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" + ) + assert get_op_types_in_program(prog) == [ + "constexpr_affine_dequantize", + "transpose", + ] + + new_op = prog.find_ops(op_type="constexpr_affine_dequantize", exactly_one=True)[0] + expected_quantized_data = np.transpose(quantized_data, (1, 0, 2)) + expected_quantized_data = np.reshape(expected_quantized_data, (2, 2, 3, 2)) + np.testing.assert_array_equal(new_op.quantized_data.val, expected_quantized_data) + + transpose_op = prog.find_ops(op_type="transpose", exactly_one=True)[0] + assert transpose_op.perm.val.tolist() == [0, 3, 2, 1] + + class QuantizationBaseTest: @staticmethod def generate_random_quantization_params( @@ -188,7 +462,10 @@ def prog(x): quantize_1_1 = mb.quantize(input=reshape, scale=0.1, output_dtype="int8") dequantize_2_1 = mb.dequantize(input=quantize_1_1, scale=0.1) - return dequantize_2_0, dequantize_2_1, + return ( + dequantize_2_0, + dequantize_2_1, + ) prev_prog, _, block = apply_pass_and_basic_check(prog, "common::int_op_canonicalization") if all_are_int: @@ -1917,3 +2194,232 @@ def prog(x): backend=("mlprogram", "fp16"), minimum_deployment_target=opset_version, ) + + +class TestInt32CastToInt16: + @pytest.mark.parametrize( + "x_dtype, dynamic, opset_version", + itertools.product( + [np.int32, np.float32], + [True, False], + [ct.target.iOS15, ct.target.iOS16, ct.target.iOS17], + ), + ) + def test_gather_int16_indices(self, x_dtype, dynamic, opset_version): + @mb.program(opset_version=opset_version) + def prog_static(): + params = np.array([[1, 2, 3], [4, 5, 6]], dtype=x_dtype) + indices = np.array([1, 0], dtype=np.int32) + return mb.gather(x=params, indices=indices, axis=-1) + + @mb.program( + [ + mb.TensorSpec(shape=(2, 3), dtype=types.numpy_type_to_builtin_type(x_dtype)), + mb.TensorSpec(shape=(2,), dtype=types.int32), + ], + opset_version=opset_version, + ) + def prog_dynamic(x, indices): + return mb.gather(x=x, indices=indices, axis=0) + + prog = prog_dynamic if dynamic else prog_static + assert get_op_types_in_program(prog) == ["gather"] + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::add_int16_cast") + + if opset_version <= ct.target.iOS16: + # iOS15 gather op's ``indices`` 
doesn't support int16, so this pass doesn't have effect. + # iOS16 cast op doesn't support int16, so this pass doesn't have effect. + assert get_op_types_in_program(prog) == get_op_types_in_program(prev_prog) + else: + # When input ``x`` is float32, the output is also float32, so no cast for output. + # When input ``x`` is int32 and cast to int16, the output will also be int16, so there + # is another cast op to cast it back to int32. + expected_ops = ["cast", "gather"] + if x_dtype == np.int32: + expected_ops = ["cast", "cast", "gather", "cast"] + assert get_op_types_in_program(prog) == expected_ops + indices_cast_op_idx = 1 if x_dtype == np.int32 else 0 + cast_op = block.find_ops(op_type="cast")[indices_cast_op_idx] + assert cast_op.dtype.val == "int16" + assert len(cast_op.outputs) == 1 + assert len(cast_op.outputs[0].child_ops) == 1 + assert cast_op.outputs[0].child_ops[0].op_type == "gather" + assert cast_op.outputs[0] == block.find_ops(op_type="gather")[0].indices + + if not dynamic: + np.testing.assert_allclose( + np.array([[2, 1], [5, 4]], dtype=np.float32), + prog.functions["main"].find_ops(op_type="gather")[0].outputs[0].val, + atol=1e-04, + rtol=1e-05, + ) + + @pytest.mark.parametrize( + "x_dtype, dynamic, opset_version", + itertools.product( + [np.int32, np.float32], + [True, False], + [ct.target.iOS15, ct.target.iOS16, ct.target.iOS17], + ), + ) + def test_gather_along_axis_int16_indices(self, x_dtype, dynamic, opset_version): + @mb.program(opset_version=opset_version) + def prog_static(): + params = np.array([[1, 2, 3], [4, 5, 6]], dtype=x_dtype) + indices = np.array([[1, 0, 1], [1, 1, 0]], dtype=np.int32) + return mb.gather_along_axis(x=params, indices=indices, axis=-1) + + @mb.program( + [ + mb.TensorSpec(shape=(2, 3), dtype=types.numpy_type_to_builtin_type(x_dtype)), + mb.TensorSpec(shape=(2, 3), dtype=types.int32), + ], + opset_version=opset_version, + ) + def prog_dynamic(x, indices): + return mb.gather_along_axis(x=x, indices=indices, axis=0) + + prog = prog_dynamic if dynamic else prog_static + assert get_op_types_in_program(prog) == ["gather_along_axis"] + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::add_int16_cast") + + if opset_version <= ct.target.iOS16: + # iOS15 gather op's ``indices`` doesn't support int16, so this pass doesn't have effect. + # iOS16 cast op doesn't support int16, so this pass doesn't have effect. + assert get_op_types_in_program(prog) == get_op_types_in_program(prev_prog) + else: + # When input ``x`` is float32, the output is also float32, so no cast for output. + # When input ``x`` is int32 and cast to int16, the output will also be int16, so there + # is another cast op to cast it back to int32. 
+ expected_ops = ["cast", "gather_along_axis"] + if x_dtype == np.int32: + expected_ops = ["cast", "cast", "gather_along_axis", "cast"] + assert get_op_types_in_program(prog) == expected_ops + indices_cast_op_idx = 1 if x_dtype == np.int32 else 0 + cast_op = block.find_ops(op_type="cast")[indices_cast_op_idx] + assert cast_op.dtype.val == "int16" + assert len(cast_op.outputs) == 1 + assert len(cast_op.outputs[0].child_ops) == 1 + assert cast_op.outputs[0].child_ops[0].op_type == "gather_along_axis" + assert cast_op.outputs[0] == block.find_ops(op_type="gather_along_axis")[0].indices + + if not dynamic: + np.testing.assert_allclose( + np.array([[2, 1, 2], [5, 5, 4]], dtype=np.float32), + prog.functions["main"].find_ops(op_type="gather_along_axis")[0].outputs[0].val, + atol=1e-04, + rtol=1e-05, + ) + + @pytest.mark.parametrize("overflow", [True, False]) + def test_gather_dynamic_overflow_int16(self, overflow): + """Dynamic input indices should also be cast if x dim size doesn't overflow int16 range.""" + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(32769 if overflow else 2, 3)), + mb.TensorSpec(shape=(2,), dtype=types.int32), + ], + opset_version=ct.target.iOS17, + ) + def prog(x, indices): + return mb.gather(x=x, indices=indices, axis=0) + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::add_int16_cast") + if overflow: + assert get_op_types_in_program(prog) == get_op_types_in_program(prev_prog) + else: + assert get_op_types_in_program(prog) == ["cast", "gather"] + cast_op = block.find_ops(op_type="cast")[0] + assert cast_op.dtype.val == "int16" + assert cast_op.outputs[0] == block.find_ops(op_type="gather")[0].indices + + def test_gather_static_overflow_int16(self): + """Indices cannot be represented by int16 range, don't cast to int16.""" + + @mb.program(opset_version=ct.target.iOS17) + def prog(): + params = np.array([[1, 2]] * 32769, dtype=np.float32) + indices = np.array([32768, 0], dtype=np.int32) + return mb.gather(x=params, indices=indices, axis=0) + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::add_int16_cast") + assert get_op_types_in_program(prog) == get_op_types_in_program(prev_prog) + + @patch( + "coremltools.converters.mil.mil.passes.defs.quantization.add_int16_cast._PREFER_INT16_OPS", + set(), + ) + def test_int16_no_effect(self): + """After patching the pass, no op should be cast to int16""" + + @mb.program( + input_specs=[mb.TensorSpec(shape=(2, 3)), mb.TensorSpec(shape=(2,), dtype=types.int32)], + opset_version=ct.target.iOS17, + ) + def prog(x, indices): + return mb.gather(x=x, indices=indices, axis=0) + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::add_int16_cast") + assert get_op_types_in_program(prog) == get_op_types_in_program(prev_prog) + + @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) + @pytest.mark.parametrize( + "compute_precision, num_embeddings, minimum_deployment_target, symbolic", + itertools.product( + [ct.precision.FLOAT16, ct.precision.FLOAT32], + [10, 32769], + [ct.target.iOS15, ct.target.iOS16, ct.target.iOS17], + [True, False], + ), + ) + def test_int16_embedding_e2e( + self, compute_precision, num_embeddings, minimum_deployment_target, symbolic + ): + """End-to-end conversion from a torch embedding model.""" + + class EmbeddingModel(nn.Module): + def __init__(self): + super(EmbeddingModel, self).__init__() + self.embedding = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=2) + + def forward(self, x): + return self.embedding(x) + + input_data = 
np.random.randint(low=0, high=num_embeddings, size=(3, 5)) + input_data = torch.from_numpy(input_data) + model = EmbeddingModel() + model.eval() + traced_model = torch.jit.trace(model, input_data) + input_shape = (ct.RangeDim(1, 32), ct.RangeDim(1, 32)) if symbolic else input_data.shape + converted_model = ct.convert( + traced_model, + inputs=[ct.TensorType(shape=input_shape, name="input", dtype=np.int32)], + convert_to="mlprogram", + compute_precision=compute_precision, + compute_units=ct.ComputeUnit.CPU_ONLY, + minimum_deployment_target=minimum_deployment_target, + ) + prog = converted_model._mil_program + + # The embedding layer is lowered to `gather` op. + expected_ops = ["gather"] + if ( + compute_precision == ct.precision.FLOAT16 + and minimum_deployment_target < ct.target.iOS16 + ): + # Cast from fp16 to fp32 because fp16 is not supported in I/O before iOS16. + expected_ops.append("cast") + if ( + minimum_deployment_target >= ct.target.iOS17 + and compute_precision == ct.precision.FLOAT16 + and num_embeddings <= np.iinfo(np.int16).max + ): + # The int16 cast only happens for iOS17+ with fp16 precision and there is no overflow. + expected_ops.insert(0, "cast") + cast_op = prog["main"].find_ops(op_type="cast")[0] + assert cast_op.dtype.val == "int16" + assert cast_op.outputs[0] == prog["main"].find_ops(op_type="gather")[0].indices + assert get_op_types_in_program(prog) == expected_ops diff --git a/coremltools/converters/mil/mil/program.py b/coremltools/converters/mil/mil/program.py index fe103d20b..462e88ffc 100644 --- a/coremltools/converters/mil/mil/program.py +++ b/coremltools/converters/mil/mil/program.py @@ -3,6 +3,9 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +from collections import defaultdict +from typing import Dict, List + import numpy as _np import sympy as _sm @@ -15,6 +18,7 @@ from . import types from .block import Function +from .operation import Operation from .types.symbolic import k_num_internal_syms, k_used_symbols from .var import Var @@ -24,6 +28,13 @@ class Program: def _get_opset_str_value(op): return f"coremltools.target.{op.name}" + @staticmethod + def _get_supported_dialect_opset() -> List[str]: + """ + Return a list of supported dialect opsets at runtime. + """ + return [] + def __init__(self): self.main_input_types = [] self.main_output_types = None @@ -31,22 +42,32 @@ def __init__(self): self.parameters = {} self.skip_all_passes = False + def _get_dialect_namespaces(self) -> Dict[str, List[Operation]]: + """ + Return a dict which maps the dialect namespace into a list of corresponding operations. 
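# Toy sketch (the classes and names here are illustrative, not the real pymil
# types) of the grouping described above: walk objects and bucket them by an
# optional "_dialect_namespace" attribute.
from collections import defaultdict

class _ToyOp:
    def __init__(self, name, namespace=None):
        self.name = name
        if namespace is not None:
            self._dialect_namespace = namespace

ops = [_ToyOp("op_1", "torch"), _ToyOp("op_2"), _ToyOp("op_3", "complex")]
buckets = defaultdict(list)
for op in ops:
    if hasattr(op, "_dialect_namespace"):
        buckets[op._dialect_namespace].append(op.name)
assert dict(buckets) == {"torch": ["op_1"], "complex": ["op_3"]}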
+ """ + res = defaultdict(list) + + def get_dialect_namespaces_block(block): + for op in list(block.operations): + for b in op.blocks: + get_dialect_namespaces_block(b) + if hasattr(op, "_dialect_namespace"): + dialect_namespace = op._dialect_namespace + res[dialect_namespace].append(op) + + for func in self.functions.values(): + get_dialect_namespaces_block(func) + return res + def _get_max_opset_version_and_op(self): max_opset_version = _target.iOS13 op_with_max_opset_version = None - def update_max_opset_version_block(block): - nonlocal max_opset_version - nonlocal op_with_max_opset_version - for op in list(block.operations): - for b in op.blocks: - update_max_opset_version_block(b) - if not hasattr(op, "_op_variants") or not isinstance(op._op_variants, dict): - continue - if op.opset_version > max_opset_version: - max_opset_version = op.opset_version - op_with_max_opset_version = op for func in self.functions.values(): - update_max_opset_version_block(func) + cur_max_opset, cur_op = func.get_max_opset_version_and_op() + if cur_max_opset > max_opset_version: + max_opset_version = cur_max_opset + op_with_max_opset_version = cur_op return max_opset_version, op_with_max_opset_version def _check_ops_version_compatibility(self, max_opset_version): @@ -95,24 +116,57 @@ def _check_program_opset_version(self): self._check_ops_version_compatibility(max_opset_version) self._check_or_set_functions_opset_version(max_opset_version) - def _check_invalid_program(self): + @staticmethod + def _get_runtime_supported_dialect_opset() -> List[str]: """ - Early error out for - 1. tensor with rank >= 6 - 2. non const tensor feed in const input + Return a list of supported dialect opsets at runtime. """ + return [] + def _check_invalid_opset(self): + """ + Check if the program consists of opsets not supported by runtime. + """ + dialect_namespaces = self._get_dialect_namespaces() + if len(dialect_namespaces) != 0: + for dialect_key in list(dialect_namespaces.keys()): + if dialect_key not in self._get_runtime_supported_dialect_opset(): + invalid_op = dialect_namespaces[dialect_key][0] + raise ValueError( + f'Core ML only support core opset. Got unsupported op "{invalid_op.name}" with type "{invalid_op.op_type}" of dialect namespace "{invalid_op._dialect_namespace}".' + ) + + def _check_invalid_tensor_rank(self): + """ + Check if the program consists of tensors with rank >= 6. + """ def _check_invalid_tensor_rank_block(block): for op in block.operations: for b in op.blocks: _check_invalid_tensor_rank_block(b) for o in op.outputs: if not isinstance(o, ListVar) and (o.rank < 0 or o.rank >= 6): + if op.op_type == "const" and len(o.child_ops) == 1 and \ + o.child_ops[0].op_type == "constexpr_lut_to_dense": + # For lut op, the lookup table is allowed to have rank > 5. + continue raise ValueError( f'Core ML only supports tensors with rank <= 5. Layer "{op.name}", ' f'with type "{op.op_type}", outputs a rank {o.rank} tensor. ' ) + for f in self.functions.values(): + _check_invalid_tensor_rank_block(f) + + def _check_invalid_const_tensor_input(self): + """ + Check if non const tensor feed into const input. + This might happen in the early stage of conversion, for instance: + constexpr_ -> reshape -> transpose -> linear + + However, the pattern is optimized into the following in a graph pass. + constexpr_ -> linear + """ def _check_invalid_const_tensor_input_block(block): for op in block.operations: for b in op.blocks: @@ -130,12 +184,20 @@ def _check_invalid_const_tensor_input_block(block): f"In op {op.name}. 
Input {k} ({v.name}) must be const or constexpr ops." ) - for f in self.functions.values(): - _check_invalid_tensor_rank_block(f) - for f in self.functions.values(): _check_invalid_const_tensor_input_block(f) + def _check_early_error_out_for_invalid_program(self): + """ + Early error out for + 1. tensor with rank >= 6 + 2. non const tensor feed into const input + 3. program consist of non mil core ops + """ + self._check_invalid_tensor_rank() + self._check_invalid_const_tensor_input() + self._check_invalid_opset() + def add_function(self, name, ssa_func): if not isinstance(ssa_func, Function): raise ValueError("Only Function can be added to Program.") @@ -229,15 +291,8 @@ def __init__(self, sym_shape, dtype=None, name=None, allow_rank0_input=False): self.dtype = dtype if self.dtype is None: self.dtype = types.float - sym_type = self.type_inference() - - # Globally unique var name for placeholders - if name is None: - name = 'placeholder_' + str(self.__class__.counter) - self.__class__.counter += 1 - - # List of output vars (consistent w/ other ops) - self.outputs = [Var(name, sym_type)] + self.name = name + self._infer_output_var() def set_name(self, name): self.name = name @@ -251,6 +306,16 @@ def type_inference(self): def __str__(self): return str(self.outputs[0]) + def _infer_output_var(self): + sym_type = self.type_inference() + + # Globally unique var name for placeholders + if self.name is None: + self.name = f"{self.__class__.__name__}_{self.__class__.counter}" + self.__class__.counter += 1 + + # List of output vars (consistent w/ other ops) + self.outputs = [Var(self.name, sym_type)] def get_new_variadic_symbol(): global k_num_internal_syms diff --git a/coremltools/converters/mil/mil/tests/test_block.py b/coremltools/converters/mil/mil/tests/test_block.py index d0674920d..0f943c710 100644 --- a/coremltools/converters/mil/mil/tests/test_block.py +++ b/coremltools/converters/mil/mil/tests/test_block.py @@ -27,7 +27,6 @@ the core API being tested here. """ - def test_empty_block(): """ Test an empty program diff --git a/coremltools/converters/mil/mil/tests/test_programs.py b/coremltools/converters/mil/mil/tests/test_programs.py index 4fbbd69a6..139bf4d62 100644 --- a/coremltools/converters/mil/mil/tests/test_programs.py +++ b/coremltools/converters/mil/mil/tests/test_programs.py @@ -9,7 +9,7 @@ import coremltools as ct from coremltools import _logger as logger from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil import Function, Program, types from coremltools.converters.mil.mil.passes.tests.test_passes import CONSTEXPR_FUNCS np.random.seed(0) @@ -199,8 +199,10 @@ def false_fn(): return mb.cond(pred=mb.cast(x=pred, dtype="bool"), _true_fn=true_fn, _false_fn=false_fn) return prog -class TestMLProgramVersionHandling: - +class TestMILProgramVersionHandling: + """ + Test basic functionality of opset version handling in pymil + """ @staticmethod def test_multi_versions_op_selection(): ''' @@ -306,6 +308,110 @@ def test_bulid_non_compatible_program_early_error_out(): with pytest.raises(ValueError, match=expected_err_str): get_simple_topk_pixel_unshuffle_program() +class TestMILBuilderAPI: + """ + Test the basic builder API. 
+ """ + def test_create_function(self): + """ + Test mb.function API + """ + @mb.function(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def func(x): + return mb.add(x=x, y=0.0) + + assert isinstance(func, Function) + assert len(func.operations) == 2 # add, const + assert len(func.inputs) == 1 + assert len(func.outputs) == 1 + + def test_create_program(self): + """ + Test mb.program API + """ + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + return mb.add(x=x, y=0.0) + + assert isinstance(prog, Program) + func = prog.functions["main"] + assert len(func.operations) == 2 # add, const + assert len(func.inputs) == 1 + assert len(func.outputs) == 1 + + def test_create_program_function_name(self): + """ + If ``function_name`` is not provide, mb.program creates function with name "main" by default. + """ + # defaults to "main" + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x0): + return x0 + + assert len(prog.functions) == 1 + assert "main" in prog.functions + + # user can also provide function_name + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))], function_name="good_function") + def prog(x0): + return x0 + + assert len(prog.functions) == 1 + assert "good_function" in prog.functions + + def test_program_with_multiple_functions(self): + """ + Basic creation of a program with multiple functions + """ + @mb.function(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def func_1(x): + return x + + @mb.function(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def func_2(x): + return x + + @mb.function(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def func_3(x): + return x + + prog = Program() + prog.add_function("func_1", func_1) + prog.add_function("func_2", func_2) + prog.add_function("func_3", func_3) + + assert set(prog.functions.keys()) == set(["func_1", "func_2", "func_3"]) + + def test_error_out_incompatible_functions(self): + """ + ``add_function`` should error out when a function with different + opset is added to a program. + """ + @mb.function(input_specs=[mb.TensorSpec(shape=(2, 4))], opset_version=ct.target.iOS13) + def func_1(x): + return x + + @mb.function(input_specs=[mb.TensorSpec(shape=(2, 4))], opset_version=ct.target.iOS17) + def func_2(x): + return x + + err_msg = "all functions must have the same opset_version." + + prog = Program() + prog.add_function("func_1", func_1) + with pytest.raises(ValueError, match=err_msg): + prog.add_function("func_2", func_2) + + prog = Program() + prog.add_function("func_2", func_2) + with pytest.raises(ValueError, match=err_msg): + prog.add_function("func_1", func_1) + + +class TestMILBasic: + """ + Test the basic error handling / validation in pymil. + """ @staticmethod def test_type_domain_validation(): ''' @@ -320,6 +426,49 @@ def prog(x): res = mb.rsqrt(x=x, epsilon=1) return res + @staticmethod + def test_get_dialect_namespaces(): + """ + Test we can get a dict of dialect namespaces in the program. 
+ """ + # The pymil program is mixed of torch / complex dialect opset + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 2, 3, 4), dtype=types.fp32)]) + def prog(x): + real_data = mb.torch_upsample_nearest_neighbor( + x=x, output_height=10, output_width=5, name="op_1" + ) + imag_data = mb.add(x=real_data, y=8.9, name="op_2") + return mb.complex(real_data=real_data, imag_data=imag_data, name="op_3") + + dialect_namespaces = prog._get_dialect_namespaces() + assert len(dialect_namespaces["torch"]) == 1 + assert dialect_namespaces["torch"][0].name == "op_1" + assert len(dialect_namespaces["complex"]) == 1 + assert dialect_namespaces["complex"][0].name == "op_3" + + # The pymil program with only core ops returns an empty dict + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 2, 3, 4), dtype=types.fp32)]) + def prog(x): + return mb.add(x=x, y=8.9) + + assert len(prog._get_dialect_namespaces()) == 0 + + @staticmethod + def test_invalid_dialect_namespaces_error_out(): + """ + The converter should early error out if dialect opset is detected in the pymil program. + """ + # The pymil program of torch dialect opset cannot be lowered to backend + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 2, 3, 4), dtype=types.fp32)]) + def prog(x): + return mb.torch_upsample_nearest_neighbor( + x=x, output_height=10, output_width=5, name="op_1" + ) + + expected_err_str = 'Core ML only support core opset. Got unsupported op "op_1" with type "torch_upsample_nearest_neighbor" of dialect namespace "torch".' + with pytest.raises(ValueError, match=expected_err_str): + ct.convert(prog, convert_to="mlprogram", pass_pipeline=ct.PassPipeline.EMPTY) + @staticmethod def test_rank6_tensor_early_error_out(): ''' diff --git a/coremltools/converters/mil/testing_utils.py b/coremltools/converters/mil/testing_utils.py index 18f2d8865..1ed4c66c5 100644 --- a/coremltools/converters/mil/testing_utils.py +++ b/coremltools/converters/mil/testing_utils.py @@ -184,6 +184,13 @@ def assert_same_input_names(prog1, prog2, func_name="main"): assert prog1_input_names == prog2_input_names +def assert_numerical_value(mil_var, expected_value): + if mil_var is None: + assert expected_value is None + else: + np.testing.assert_allclose(mil_var.val, expected_value) + + def assert_same_input_types(prog1, prog2, func_name="main"): prog1_input_types = [x.dtype for x in list(prog1[func_name].inputs.values())] prog2_input_types = [x.dtype for x in list(prog2[func_name].inputs.values())] diff --git a/coremltools/models/__init__.py b/coremltools/models/__init__.py index 56c47c3db..698652f07 100644 --- a/coremltools/models/__init__.py +++ b/coremltools/models/__init__.py @@ -31,6 +31,7 @@ _QUANTIZATION_MODE_DEQUANTIZE, _METADATA_VERSION, _METADATA_SOURCE, + _METADATA_SOURCE_DIALECT, ) from . 
import neural_network diff --git a/coremltools/models/model.py b/coremltools/models/model.py index e60e5f29a..9d409f530 100644 --- a/coremltools/models/model.py +++ b/coremltools/models/model.py @@ -93,7 +93,7 @@ _METADATA_VERSION = "com.github.apple.coremltools.version" _METADATA_SOURCE = "com.github.apple.coremltools.source" - +_METADATA_SOURCE_DIALECT = "com.github.apple.coremltools.source_dialect" class _FeatureDescription: diff --git a/coremltools/optimize/coreml/_config.py b/coremltools/optimize/coreml/_config.py index edf8e2272..126c6ec1f 100644 --- a/coremltools/optimize/coreml/_config.py +++ b/coremltools/optimize/coreml/_config.py @@ -529,6 +529,7 @@ def lut_function(weight): weight_threshold: Optional[int] = field(default=2048, validator=validators.optional([validators.instance_of(int), _check_weight_threshold])) _WEIGHT_PALETTIZATION_MODES = ("KMEANS", "UNIFORM", "UNIQUE", "CUSTOM") + _VALID_NBITS = (1, 2, 4, 6, 8) @nbits.validator def check_nbits(self, attr, nbits): @@ -540,9 +541,9 @@ def check_nbits(self, attr, nbits): if nbits is not None and mode in ("UNIQUE", "CUSTOM"): raise ValueError(f"\"nbits\" must NOT be provided for {self.mode} mode") - if nbits is not None and nbits not in [1, 2, 4, 6, 8]: + if nbits is not None and nbits not in self._VALID_NBITS: raise ValueError( - f"Invalid value of \"nbits\" ({nbits}) for palettization. Supported \"nbits\" are {{1, 2, 4, 6, 8}}" + f'Invalid value of "nbits" ({nbits}) for palettization. Supported "nbits" are {self._VALID_NBITS}' ) @mode.validator diff --git a/coremltools/optimize/coreml/_quantization_passes.py b/coremltools/optimize/coreml/_quantization_passes.py index 79a1cc520..fba517d78 100644 --- a/coremltools/optimize/coreml/_quantization_passes.py +++ b/coremltools/optimize/coreml/_quantization_passes.py @@ -3,12 +3,14 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +from typing import Callable, Optional, Tuple + import numpy as np from tqdm import tqdm from coremltools import _logger as logger -from coremltools.converters.mil.backend.mil.load import should_use_weight_file from coremltools.converters.mil._deployment_compatibility import AvailableTarget +from coremltools.converters.mil.backend.mil.load import should_use_weight_file from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Operation, Program, types from coremltools.converters.mil.mil.block import is_current_opset_version_compatible_with @@ -166,6 +168,31 @@ def get_supported_types_as_str(supported_type): supported_type_str = get_supported_types_as_str(self._SUPPORTED_CONFIG_TYPE) raise ValueError(f"{self.__class__.__name__} only accept {supported_type_str} type config. Got {config.__class__.__name__}.") + @staticmethod + def pick_channnel_axis(op: Operation) -> int: + """ + By default, output channel is used as the channel axis. 
Here are some representative ops: + - linear: [D_out, D_in] + - matmul's y: [..., D_in, D_out] if transpose_y is False, else [..., D_out, D_in] + - conv: [C_out, C_in_div_group, KH, KW] + - conv_transpose: [C_in, C_out_div_group, KH, KW] + + So the channel axis picking criterial is: + - For conv_transpose it's 1 + - For matmul's y it's -1 (transpose_y=False) or -2 (transpose_y=True) + - For all other ops, it's 0 + """ + channel_axis = 0 + var = op.outputs[0] + if len(var.child_ops) == 1: + child_op = var.child_ops[0] + if child_op.op_type == "conv_transpose": + channel_axis = 1 + if child_op.op_type == "matmul" and child_op.y == var: + channel_axis = -1 if child_op.transpose_y else -2 + return channel_axis + + @register_pass(namespace="compression") class prune_weights(AbstractCompressionPass): """ @@ -424,6 +451,7 @@ class palettize_weights(AbstractCompressionPass): - Old ``const`` op is replaced by a newly created operation. """ _SUPPORTED_CONFIG_TYPE = OpPalettizerConfig + _SUPPORTED_NBITS = (1, 2, 4, 6, 8) def is_valid_op(self, op: Operation): if op.op_type == "const" and should_use_weight_file(op.outputs[0].val): @@ -431,8 +459,19 @@ def is_valid_op(self, op: Operation): return False @staticmethod - def compress(val, mode, nbits=None, lut_function=None): + def _get_nbits_for_unique_mode(val: np.ndarray, allowed_nbits: Tuple[int, ...]) -> int: + val = val.flatten() + unique_vals = np.unique(val).tolist() + for nbits in allowed_nbits: + if len(unique_vals) <= 1 << nbits: + return nbits + raise ValueError("Unique values in weight cannot be represented by 8 bits palettization.") + @staticmethod + def _get_lut_and_indices( + val: np.ndarray, mode: str, nbits: Optional[int], lut_function: Optional[Callable] + ) -> Tuple[np.ndarray, np.ndarray]: + """Calculate look-up-table (LUT) and indices.""" def compress_kmeans(val, nbits): lut, indices = _get_kmeans_lookup_table_and_weight(nbits, val) lut = lut.astype(val.dtype) @@ -451,16 +490,6 @@ def compress_uniform(val, nbits): lut = lut.astype(val.dtype) return lut, indices - def get_nbits_for_unique_mode(val): - val = val.flatten() - unique_vals = np.unique(val).tolist() - for nbits in (1, 2, 4, 6, 8): - if len(unique_vals) <= 1 << nbits: - return nbits - msg = "weight value cannot be represented in an 8 bits palettization. Skipped." - logger.warning(msg) - return None - def compress_unique(val, nbits): val = val.flatten() unique_vals = np.unique(val).tolist() @@ -483,6 +512,25 @@ def compress_unique(val, nbits): indices = indices.astype(np.uint8) return lut, indices + if mode == "KMEANS": + lut, indices = compress_kmeans(val, nbits) + elif mode == "UNIFORM": + lut, indices = compress_uniform(val, nbits) + elif mode == "UNIQUE": + if nbits is None: + nbits = palettize_weights._get_nbits_for_unique_mode( + val, palettize_weights._SUPPORTED_NBITS + ) + lut, indices = compress_unique(val, nbits) + else: + if mode != "CUSTOM": + raise AssertionError(f"Invalid mode {mode}") + lut, indices = lut_function(val) + + return lut, indices + + @staticmethod + def compress(val, mode, nbits=None, lut_function=None) -> LutParams: def check_lut_parameters_are_valid(val, lut, indices): if not isinstance(lut, np.ndarray) or not isinstance(indices, np.ndarray): raise ValueError("LUT and indices must be type of numpy array.") @@ -508,17 +556,7 @@ def check_lut_parameters_are_valid(val, lut, indices): if not isinstance(val, (np.ndarray, np.generic)): raise ValueError(f"Only numpy arrays are supported. 
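In other words, the reduction axes for per-channel statistics are every axis except the picked channel axis. A small numpy sketch of the resulting per-channel scale shapes (the weight shapes below are hypothetical, and `scales_shape` is a local helper, not library code):

    import numpy as np

    conv_weight = np.zeros((32, 16, 3, 3))    # conv: channel axis 0 -> 32 scales
    deconv_weight = np.zeros((16, 32, 3, 3))  # conv_transpose: channel axis 1 -> 32 scales
    matmul_y = np.zeros((64, 128))            # matmul y, transpose_y=False: axis -1 -> 128 scales

    def scales_shape(weight, channel_axis):
        # Reduce over every axis except the channel axis, keeping dims so the
        # scales broadcast back against the weight.
        axes = tuple(i for i in range(weight.ndim) if i != channel_axis % weight.ndim)
        return np.amin(weight, axis=axes, keepdims=True).shape

    assert scales_shape(conv_weight, 0) == (32, 1, 1, 1)
    assert scales_shape(deconv_weight, 1) == (1, 32, 1, 1)
    assert scales_shape(matmul_y, -1) == (1, 128)
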
Got {type(val)}") - if mode == "KMEANS": - lut, indices = compress_kmeans(val, nbits) - elif mode == "UNIFORM": - lut, indices = compress_uniform(val, nbits) - elif mode == "UNIQUE": - nbits = get_nbits_for_unique_mode(val) - if nbits is None: - return None - lut, indices = compress_unique(val, nbits) - elif mode == "CUSTOM": - lut, indices = lut_function(val) + lut, indices = palettize_weights._get_lut_and_indices(val, mode, nbits, lut_function) check_lut_parameters_are_valid(val, lut, indices) @@ -541,6 +579,15 @@ def transform_op(self, op: Operation): if not self.need_compress_const(op, self.config._is_deprecated, op_config.weight_threshold): return + if op_config.mode == "UNIQUE": + try: + palettize_weights._get_nbits_for_unique_mode( + op.outputs[0].val, self._SUPPORTED_NBITS + ) + except ValueError as e: + logger.warning(f"Skip op {op.name} for palettization, because {e}") + return + lut_params = self.compress( op.outputs[0].val, op_config.mode, @@ -548,9 +595,6 @@ def transform_op(self, op: Operation): op_config.lut_function ) - if lut_params is None: - return - if not self.fake_compression: new_var = mb.constexpr_lut_to_dense( indices=lut_params.indices, @@ -591,46 +635,28 @@ class linear_quantize_weights(AbstractCompressionPass): - If ``fake_compression=True``, compressed value is decompressed and then encoded using the ``const`` op. """ _SUPPORTED_CONFIG_TYPE = OpLinearQuantizerConfig + _MODE_DTYPE_TO_RANGE = { + (types.int8, "LINEAR"): (-128, 127), + (types.int8, "LINEAR_SYMMETRIC"): (-127, 127), + (types.uint8, "LINEAR"): (0, 255), + (types.uint8, "LINEAR_SYMMETRIC"): (0, 254), + } def is_valid_op(self, op: Operation): if op.op_type == "const" and should_use_weight_file(op.outputs[0].val): return True return False - @staticmethod - def _get_axis(op): - axis = 0 - var = op.outputs[0] - if len(var.child_ops) == 1 and var.child_ops[0].op_type == "conv_transpose": - axis = 1 - return axis + @classmethod + def _get_quantized_data( + cls, original_data: np.ndarray, axes: Tuple[int, ...], mode: str, dtype: type + ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]]: + """Get quantized data along with metadata (scale, zero_point).""" + if not np.issubdtype(original_data.dtype, np.floating): + raise ValueError("Only floating numpy arrays are supported.") - @staticmethod - def compress(val, axis, mode, dtype): - def _ensure_numerical_range_and_cast(val, low, high, np_dtype): - ''' - For some cases, the computed quantized data might exceed the data range. - For instance, after rounding and addition, we might get `128` for the int8 quantization. - This utility function ensures the val in the data range before doing the cast. 
- ''' - val = np.minimum(val, high) - val = np.maximum(val, low) - return val.astype(np_dtype) - - mode_dtype_to_range = { - (types.int8, "LINEAR"): (-128, 127), - (types.int8, "LINEAR_SYMMETRIC"): (-127, 127), - (types.uint8, "LINEAR"): (0, 255), - (types.uint8, "LINEAR_SYMMETRIC"): (0, 254), - } - - if not isinstance(val, (np.ndarray, np.generic)): - raise ValueError("Only numpy arrays are supported") - - params = AffineQuantParams() - axes = tuple([i for i in range(len(val.shape)) if i != axis]) - val_min = np.amin(val, axis=axes, keepdims=True) - val_max = np.amax(val, axis=axes, keepdims=True) + val_min = np.amin(original_data, axis=axes, keepdims=True) + val_max = np.amax(original_data, axis=axes, keepdims=True) if mode == "LINEAR_SYMMETRIC": # For the linear_symmetric mode, the range is symmetrical to 0 @@ -643,39 +669,42 @@ def _ensure_numerical_range_and_cast(val, low, high, np_dtype): val_min = np.minimum(0.0, val_min) val_max = np.maximum(0.0, val_max) - q_val_min, q_val_max = mode_dtype_to_range[(dtype, mode)] - - # Set the zero point to symmetric mode + q_val_min, q_val_max = cls._MODE_DTYPE_TO_RANGE[(dtype, mode)] np_dtype = nptype_from_builtin(dtype) + zero_point = None if mode == "LINEAR_SYMMETRIC": - if dtype == types.int8: - params.zero_point = (0 * np.ones(val_min.shape)).astype(np.int8) - else: - assert dtype == types.uint8 - params.zero_point = (127 * np.ones(val_min.shape)).astype(np.uint8) + if dtype.is_unsigned(): + zero_point_shift = q_val_max // 2 + zero_point = zero_point_shift * np.ones(val_min.shape) else: assert mode == "LINEAR" - params.zero_point = (q_val_min * val_max - q_val_max * val_min) / (val_max - val_min) - params.zero_point = np.round(params.zero_point) - params.zero_point = _ensure_numerical_range_and_cast(params.zero_point, q_val_min, q_val_max, np_dtype) - - # compute the params - params.scale = (val_max - val_min) / (q_val_max - q_val_min) - params.scale = params.scale.astype(val.dtype).squeeze() - - params.quantized_data = np.round( - val * (q_val_max - q_val_min) / (val_max - val_min) - ) - params.quantized_data = (params.quantized_data + params.zero_point) - params.quantized_data = _ensure_numerical_range_and_cast(params.quantized_data, q_val_min, q_val_max, np_dtype) - - params.zero_point = params.zero_point.squeeze() - params.axis = axis - - return params + zero_point = (q_val_min * val_max - q_val_max * val_min) / (val_max - val_min) + zero_point = np.round(zero_point) + zero_point = np.clip(zero_point, q_val_min, q_val_max) + + scale = (val_max - val_min) / (q_val_max - q_val_min) + quantized_data = np.round(original_data / scale) + if zero_point is not None: + quantized_data += zero_point + zero_point = zero_point.squeeze().astype(np_dtype) + quantized_data = np.clip(quantized_data, q_val_min, q_val_max).astype(np_dtype) + scale = scale.astype(original_data.dtype).squeeze() + + return quantized_data, scale, zero_point + + @classmethod + def compress(cls, val: np.ndarray, axis: int, mode: str, dtype: type) -> AffineQuantParams: + if not isinstance(val, (np.ndarray, np.generic)): + raise ValueError("Only numpy arrays are supported") + axes = tuple([i for i in range(len(val.shape)) if i != axis]) + quantized_data, scale, zero_point = cls._get_quantized_data(val, axes, mode, dtype) + if zero_point is None: + # The iOS16 constexpr_affine_dequantize op requires zero_point. 
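For a concrete feel of the LINEAR-mode arithmetic above, a hand-worked single-channel uint8 example (values picked arbitrarily; the real pass computes these per channel with keepdims):

    import numpy as np

    # One channel with weights spanning [-1.0, 3.0], quantized to uint8 (0..255).
    w = np.array([-1.0, 0.0, 1.5, 3.0], dtype=np.float32)
    w_min, w_max = w.min(), w.max()
    q_min, q_max = 0, 255

    scale = (w_max - w_min) / (q_max - q_min)                                  # ~0.0157
    zero_point = np.round((q_min * w_max - q_max * w_min) / (w_max - w_min))   # 63.75 -> 64
    q = np.clip(np.round(w / scale) + zero_point, q_min, q_max).astype(np.uint8)

    # Dequantizing recovers the originals up to one quantization step.
    w_back = scale * (q.astype(np.float32) - zero_point)
    assert np.allclose(w_back, w, atol=scale)
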
+ zero_point = np.zeros_like(scale).astype(quantized_data.dtype) + return AffineQuantParams(quantized_data, zero_point, scale, axis) @staticmethod - def decompress(params): + def decompress(params: AffineQuantParams) -> np.ndarray: if not isinstance(params, AffineQuantParams): raise ValueError("Invalid type of params") return constexpr_affine_dequantize.decompress( @@ -689,7 +718,9 @@ def transform_op(self, op: Operation): if not self.need_compress_const(op, self.config._is_deprecated, op_config.weight_threshold): return - quant_params = self.compress(op.outputs[0].val, self._get_axis(op), op_config.mode, op_config.dtype) + quant_params = self.compress( + op.outputs[0].val, self.pick_channnel_axis(op), op_config.mode, op_config.dtype + ) if not self.fake_compression: new_var = mb.constexpr_affine_dequantize( diff --git a/coremltools/optimize/torch/pruning/magnitude_pruner.py b/coremltools/optimize/torch/pruning/magnitude_pruner.py index d837fc635..6d68eb531 100644 --- a/coremltools/optimize/torch/pruning/magnitude_pruner.py +++ b/coremltools/optimize/torch/pruning/magnitude_pruner.py @@ -304,7 +304,7 @@ def __attrs_post_init__(self): if self.initial_sparsity is not None and self.initial_sparsity > 0.0: raise ValueError( f"Received initial_sparsity = {self.initial_sparsity} and " - f"n_m_ratio = {self.n_m_ratio}. When n_m_ratio != None, the only allowed " + f"n_m_ratio = {self.nm_ratio}. When n_m_ratio != None, the only allowed " f"value of initial_sparsity is 0." ) diff --git a/coremltools/test/ml_program/test_compression.py b/coremltools/test/ml_program/test_compression.py index c08899458..7452c1710 100644 --- a/coremltools/test/ml_program/test_compression.py +++ b/coremltools/test/ml_program/test_compression.py @@ -3,20 +3,36 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +from typing import Optional + import numpy as np import torch import coremltools as ct +from coremltools.converters.mil.testing_utils import get_op_types_in_program from coremltools.models.ml_program.compression_utils import ( affine_quantize_weights, decompress_weights, palettize_weights, sparsify_weights, ) -from coremltools.converters.mil.testing_utils import get_op_types_in_program +from coremltools.optimize.coreml._config import OpCompressorConfig + +def get_test_model_and_data( + multi_layer: bool = False, quantize_config: Optional[OpCompressorConfig] = None +): + """ + Prepare test model and data. + + :param multi_layer: If set, the test model will have multiple `nn.Conv2d` layers. + :param quantize_config: If set, the weights in the test model will be nbits quantization-friendly, + which means it will be first quantized according to the config, and then dequantized, so the + numerical error introduced during the quantization test will be minimum. + """ + if quantize_config is not None and multi_layer: + raise AssertionError("Multi-layer model doesn't support pre_quantize_nbits.") -def get_test_model_and_data(multi_layer=False): inputs = [ct.TensorType(name="data", shape=(1, 64, 10, 10))] torch_input_values = [torch.rand(*i.shape.to_list()) for i in inputs] coreml_input_values = { @@ -37,6 +53,25 @@ def forward(self, x): model = Model().eval() else: model = torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=2) + if quantize_config is not None: + # Manually change weight to make it quantization friendly. 
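Since LINEAR_SYMMETRIC int8 produces no zero point, an all-zero tensor is substituted so the constexpr op still receives one; the round trip is then a plain scale multiply. A quick numeric check (values picked by hand, using the standard affine-dequantize formula scale * (q - zero_point)):

    import numpy as np

    q = np.array([-127, 0, 64, 127], dtype=np.int8)    # symmetric int8 data
    scale = np.float32(0.02)
    zero_point = np.zeros_like(scale).astype(q.dtype)  # required by the op, even when unused

    dequant = scale * (q.astype(np.float32) - zero_point)
    assert np.allclose(dequant, [-2.54, 0.0, 1.28, 2.54])
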
+ nbits_range_max = 2 ** (quantize_config.nbits - 1) - 1 + mode_to_range = { + "LINEAR": (-nbits_range_max - 1, nbits_range_max), + "LINEAR_SYMMETRIC": (-nbits_range_max, nbits_range_max), + } + q_val_min, q_val_max = mode_to_range[quantize_config.mode] + original_shape = model.weight.detach().numpy().shape + fake_scale = 2.0 + quantize_friendly_weight = ( + np.random.randint(low=q_val_min, high=q_val_max + 1, size=original_shape) + * fake_scale + ) + with torch.no_grad(): + model.weight = torch.nn.Parameter( + torch.from_numpy(quantize_friendly_weight).float() + ) + model = model.eval() return model, inputs, torch_input_values, coreml_input_values diff --git a/coremltools/test/neural_network/test_numpy_nn_layers.py b/coremltools/test/neural_network/test_numpy_nn_layers.py index 404d60f97..134a4a515 100644 --- a/coremltools/test/neural_network/test_numpy_nn_layers.py +++ b/coremltools/test/neural_network/test_numpy_nn_layers.py @@ -6400,6 +6400,7 @@ def _test_pool3d_single_case( input_shape=shape[2:], strides=stride, ) + total_paddings = list(total_paddings) total_paddings.reverse() for p in total_paddings: before = int(math.floor(float(p) / 2.0)) diff --git a/coremltools/test/neural_network/test_tf_numeric.py b/coremltools/test/neural_network/test_tf_numeric.py index 3fcdca7e5..2899aca77 100644 --- a/coremltools/test/neural_network/test_tf_numeric.py +++ b/coremltools/test/neural_network/test_tf_numeric.py @@ -165,6 +165,11 @@ def test_data_reorganize_cpu_only(self): self.test_data_reorganize(cpu_only=True) def test_depthwise_conv(self, cpu_only=False): + if not cpu_only: + pytest.xfail( + "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" + ) + def get_coreml_model_depthwise(X, params, w): eval = True mlmodel = None diff --git a/coremltools/test/optimize/coreml/test_post_training_quantization.py b/coremltools/test/optimize/coreml/test_post_training_quantization.py index 7fb842bfb..3d0c17a57 100644 --- a/coremltools/test/optimize/coreml/test_post_training_quantization.py +++ b/coremltools/test/optimize/coreml/test_post_training_quantization.py @@ -98,10 +98,9 @@ def create_unique_weight(weight, nbits): size = weight.detach().numpy().size unique_number = 1 << nbits - weight = [] - partition_len = size // unique_number + 1 - for i in range(unique_number): - weight += [i] * (partition_len) + weight = list(range(unique_number)) + if size > unique_number: + weight.extend([unique_number - 1] * (size - unique_number)) weight = np.reshape(np.array(weight[:size]).astype(np.float32), shape) return weight @@ -324,7 +323,7 @@ def test_weight_palettization_unique_case_2(self, caplog): # validate parameters # converter should warn the user that one weight is not compressed mlmodel_palettized = palettize_weights(mlmodel, mode="unique") - warning_msg = "weight value cannot be represented in an 8 bits palettization. Skipped." + warning_msg = "Unique values in weight cannot be represented by 8 bits palettization." 
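The rewritten helper only has to guarantee at most 2**nbits distinct values so that UNIQUE-mode palettization succeeds; an equivalent standalone construction (shape and nbits chosen arbitrarily, helper name local to this sketch):

    import numpy as np

    def make_unique_weight(shape, nbits):
        size = int(np.prod(shape))
        n_levels = 1 << nbits
        # The first n_levels elements enumerate every level; any remainder repeats
        # the last level, so the unique-value count never exceeds 2**nbits.
        vals = list(range(n_levels)) + [n_levels - 1] * max(0, size - n_levels)
        return np.array(vals[:size], dtype=np.float32).reshape(shape)

    w = make_unique_weight((4, 8), nbits=4)   # 32 elements, 16 distinct values
    assert np.unique(w).size <= 1 << 4
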
        assert any([warning_msg in rec.message for rec in caplog.records])

        expected_ops = ['constexpr_lut_to_dense', 'cast', 'conv', 'conv', 'cast']
diff --git a/coremltools/version.py b/coremltools/version.py
index 27e3a7666..c21924b8c 100644
--- a/coremltools/version.py
+++ b/coremltools/version.py
@@ -4,4 +4,4 @@
 # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause


-__version__ = "7.0"  # VERSION_STRING
+__version__ = "7.1"  # VERSION_STRING
diff --git a/reqs/test.pip b/reqs/test.pip
index e90870439..784ce6769 100644
--- a/reqs/test.pip
+++ b/reqs/test.pip
@@ -24,9 +24,9 @@ scipy==1.9.2; python_version == '3.11'
 six
 sympy > 1.6
 gast==0.4.0
-torch==2.0.1
-torchaudio==2.0.2
-torchvision==0.15.2
+torch==2.1.0
+torchaudio==2.1.0
+torchvision==0.16.0
 xgboost==1.4.2; platform_machine != "arm64"
 mock
 wrapt
diff --git a/scripts/build.sh b/scripts/build.sh
index f74c15eab..43e1059b7 100755
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -91,7 +91,7 @@ cd ${BUILD_DIR}
 ADDITIONAL_CMAKE_OPTIONS=""
 if [[ "$OSTYPE" == "darwin"* ]]; then
     NUM_PROCS=$(sysctl -n hw.ncpu)
-    ADDITIONAL_CMAKE_OPTIONS="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.15"
+    ADDITIONAL_CMAKE_OPTIONS="-DCMAKE_OSX_DEPLOYMENT_TARGET=12.3"
 else
     NUM_PROCS=$(nproc)
 fi