diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index ae795b12ab..4e1cb99cc0 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -130,9 +130,9 @@ cleanup_files() { prepare_artifacts_upload() { if [ -n "$UPLOAD_DIR" ]; then echo "Preparing for uploading generated artifacs" + zip -j model.zip "${EXPORTED_MODEL_NAME}" tokenizer.bin mkdir -p "${UPLOAD_DIR}" - zip -j "model.zip" "${MODEL_NAME}" tokenizer.bin - cp "model.zip" "${UPLOAD_DIR}" + mv model.zip "${UPLOAD_DIR}" fi } diff --git a/.github/ghstack_direct b/.github/ghstack_direct new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index a8223eef2c..18f7f06d0b 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -30,13 +30,47 @@ on: description: The list of configs used the benchmark required: false type: string + test_spec: + description: The test spec to drive the test on AWS devices + required: false + type: string + default: https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml + workflow_call: + inputs: + models: + description: Models to be benchmarked + required: false + type: string + default: stories110M + devices: + description: Target devices to run benchmark + required: false + type: string + default: samsung_galaxy_s2x + delegates: + description: Backend delegates + required: false + type: string + default: xnnpack + threadpool: + description: Run with threadpool? + required: false + type: boolean + default: false + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + test_spec: + description: The test spec to drive the test on AWS devices + required: false + type: string + default: https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true -permissions: read-all - jobs: set-parameters: runs-on: linux.2xlarge @@ -48,11 +82,27 @@ jobs: - name: Set parameters id: set-parameters shell: bash + env: + # Separate default values from the workflow dispatch. To ensure defaults are accessible + # during scheduled runs and to provide flexibility for different defaults between + # on-demand and periodic benchmarking. + CRON_DEFAULT_MODELS: "stories110M" + CRON_DEFAULT_DEVICES: "samsung_galaxy_s2x" + CRON_DEFAULT_DELEGATES: "xnnpack" run: | set -ex MODELS="${{ inputs.models }}" + if [ -z "$MODELS" ]; then + MODELS="$CRON_DEFAULT_MODELS" + fi DEVICES="${{ inputs.devices }}" + if [ -z "$DEVICES" ]; then + DEVICES="$CRON_DEFAULT_DEVICES" + fi DELEGATES="${{ inputs.delegates }}" + if [ -z "$DELEGATES" ]; then + DELEGATES="$CRON_DEFAULT_DELEGATES" + fi # Mapping devices to their corresponding device-pool-arn declare -A DEVICE_POOL_ARNS @@ -218,7 +268,6 @@ jobs: # TODO: Hard code llm_demo_bpe for now in this job. 
android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_bpe/app-debug.apk android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_bpe/app-debug-androidTest.apk - # The test spec can be downloaded from https://ossci-assets.s3.amazonaws.com/android-llama2-device-farm-test-spec.yml - test-spec: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/abd86868-fa63-467e-a5c7-218194665a77 + test-spec: ${{ inputs.test_spec }} # Uploaded to S3 from the previous job extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/${{ matrix.model }}_${{ matrix.delegate }}/model.zip diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index 7b3d8ab9a8..3c88d02db5 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -170,8 +170,7 @@ jobs: # Uploaded to S3 from the previous job, the name of the app comes from the project itself android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_${{ matrix.tokenizer }}/app-debug.apk android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_${{ matrix.tokenizer }}/app-debug-androidTest.apk - # The test spec can be downloaded from https://ossci-assets.s3.amazonaws.com/android-llama2-device-farm-test-spec.yml - test-spec: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/abd86868-fa63-467e-a5c7-218194665a77 + test-spec: https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml # Among the input, this is the biggest file, so it is cached on AWS to make the test faster. Note that the file is deleted by AWS after 30 # days and the job will automatically re-upload the file when that happens. extra-data: https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b-0717.zip diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 591a0328b7..5410c17ba7 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -205,27 +205,13 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + # install pybind + bash install_requirements.sh --pybind xnnpack + # install Llava requirements bash examples/models/llama2/install_requirements.sh bash examples/models/llava/install_requirements.sh - # run export_llava.sh - python examples/models/llava/export_llava.py --use-sdpa-with-kv-cache --pte-name llava_custom_sdpa.pte - - # verify file exists - if [ ! -f "llava_custom_sdpa.pte" ]; then - echo "llava_custom_sdpa.pte not found!" - exit 1 - fi - - python examples/models/llava/export_llava.py --no-use-sdpa-with-kv-cache --pte-name llava.pte - - # verify file exists - if [ ! -f "llava.pte" ]; then - echo "llava.pte not found!" - exit 1 - fi - # run python unittest python -m unittest examples.models.llava.test.test_llava @@ -337,7 +323,7 @@ jobs: size=${arr[4]} # threshold=48120 on devserver with gcc11.4 # todo(lfq): update once binary size is below 50kb. 
- threshold="51768" + threshold="51784" if [[ "$size" -le "$threshold" ]]; then echo "Success $size <= $threshold" else diff --git a/.github/workflows/upload-test-specs.yml b/.github/workflows/upload-test-specs.yml new file mode 100644 index 0000000000..24119b6456 --- /dev/null +++ b/.github/workflows/upload-test-specs.yml @@ -0,0 +1,92 @@ +name: Upload AWS Device Farm test specs + +on: + pull_request: + paths: + - .github/workflows/upload-test-specs.yml + - examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml + push: + branches: + - main + paths: + - .github/workflows/upload-test-specs.yml + - examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + upload-android-test-spec-for-validation: + runs-on: linux.2xlarge + steps: + - uses: actions/checkout@v3 + + - name: Upload the spec as a GitHub artifact for validation + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifact + retention-days: 1 + if-no-files-found: error + path: examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml + + validate-android-test-spec: + needs: upload-android-test-spec-for-validation + uses: ./.github/workflows/android-perf.yml + permissions: + id-token: write + contents: read + with: + # Just use a small model here with a minimal amount of configuration to test the spec + models: stories110M + devices: samsung_galaxy_s2x + delegates: xnnpack + test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/android-llm-device-farm-test-spec.yml + + upload-android-test-spec: + needs: validate-android-test-spec + runs-on: ubuntu-22.04 + timeout-minutes: 15 + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: pip + + - name: configure aws credentials + uses: aws-actions/configure-aws-credentials@v1.7.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_executorch_upload-frameworks-android + aws-region: us-east-1 + + - name: Only push to S3 when running the workflow manually from main branch + if: ${{ github.ref == 'refs/heads/main' }} + shell: bash + run: | + set -eux + echo "UPLOAD_ON_MAIN=1" >> "${GITHUB_ENV}" + + - name: Upload the spec to S3 ossci-android bucket + shell: bash + working-directory: examples/demo-apps/android/LlamaDemo/ + env: + SPEC_FILE: android-llm-device-farm-test-spec.yml + run: | + set -eux + + pip install awscli==1.32.18 + + AWS_CMD="aws s3 cp --dryrun" + if [[ "${UPLOAD_ON_MAIN:-0}" == "1" ]]; then + AWS_CMD="aws s3 cp" + fi + + shasum -a 256 "${SPEC_FILE}" + ${AWS_CMD} "${SPEC_FILE}" s3://ossci-android/executorch/ --acl public-read diff --git a/.gitmodules b/.gitmodules index 33324b17e2..0999bdb935 100644 --- a/.gitmodules +++ b/.gitmodules @@ -28,9 +28,6 @@ [submodule "backends/xnnpack/third-party/pthreadpool"] path = backends/xnnpack/third-party/pthreadpool url = https://github.com/Maratyszcza/pthreadpool.git -[submodule "examples/third-party/LLaVA"] - path = examples/third-party/LLaVA - url = https://github.com/haotian-liu/LLaVA.git [submodule "examples/third-party/fbjni"] path = 
examples/third-party/fbjni url = https://github.com/facebookincubator/fbjni.git diff --git a/backends/apple/coreml/setup.md b/backends/apple/coreml/setup.md index 4e66544f7b..0efd9bbcc2 100644 --- a/backends/apple/coreml/setup.md +++ b/backends/apple/coreml/setup.md @@ -50,7 +50,7 @@ xcode-select --install ```bash cd executorch -./build/build_apple_frameworks.sh --Release --coreml +./build/build_apple_frameworks.sh --coreml ``` 5. Open the project in Xcode, and drag `executorch.xcframework` and `coreml_backend.xcframework` frameworks generated from Step 2 to Frameworks. diff --git a/backends/apple/mps/CMakeLists.txt b/backends/apple/mps/CMakeLists.txt index f364319283..f47139a000 100644 --- a/backends/apple/mps/CMakeLists.txt +++ b/backends/apple/mps/CMakeLists.txt @@ -69,7 +69,7 @@ add_library(mpsdelegate ${_mps_backend__srcs}) find_library(FOUNDATION_FRAMEWORK Foundation) find_library(METAL_FRAMEWORK Metal) find_library(MPS_FRAMEWORK MetalPerformanceShaders) -find_library(MPS_GRAPG_FRAMEWORK MetalPerformanceShadersGraph) +find_library(MPS_GRAPH_FRAMEWORK MetalPerformanceShadersGraph) target_link_libraries( mpsdelegate @@ -79,7 +79,7 @@ target_link_libraries( ${FOUNDATION_FRAMEWORK} ${METAL_FRAMEWORK} ${MPS_FRAMEWORK} - ${MPS_GRAPG_FRAMEWORK} + ${MPS_GRAPH_FRAMEWORK} ) target_link_options_shared_lib(mpsdelegate) diff --git a/backends/apple/mps/mps_preprocess.py b/backends/apple/mps/mps_preprocess.py index 519b4b31ec..f195897882 100644 --- a/backends/apple/mps/mps_preprocess.py +++ b/backends/apple/mps/mps_preprocess.py @@ -2,9 +2,8 @@ # Copyright (c) 2023 Apple Inc. All rights reserved. # Provided subject to the LICENSE file in the top level directory. # - import logging -from typing import Dict, final, List +from typing import ClassVar, Dict, final, List, Tuple import torch @@ -16,6 +15,8 @@ ) from executorch.backends.apple.mps.serialization.mps_graph_schema import ( + Buffer, + DataSegment, MPSGraph, MPSTensor, OpType, @@ -25,6 +26,7 @@ convert_to_flatbuffer, ) from executorch.backends.apple.mps.utils.mps_utils import is_parameter +from executorch.exir._serialize._program import Cord from executorch.exir.backend.backend_details import ( BackendDetails, @@ -39,6 +41,29 @@ @final class MPSBackend(BackendDetails): + @staticmethod + def slice_len_max(s): + assert s.start is not None + assert s.stop is not None + step = 1 + if s.step is not None: + step = s.step + return max((s.stop - s.start) // step, 1) + + MAGIC_IX: ClassVar[slice] = slice(4, 8) + DATA_SEGMENT_OFFSET_IX: ClassVar[slice] = slice(8, 16) + DATA_SEGMENT_SIZE_IX: ClassVar[slice] = slice(16, 24) + + # magic bytes that should be at the beginning of the header + EXPECTED_MAGIC: ClassVar[bytes] = b"MP00" + # The length of the header in bytes + EXPECTED_LENGTH: ClassVar[int] = ( + 4 + + slice_len_max(MAGIC_IX) + + slice_len_max(DATA_SEGMENT_OFFSET_IX) + + slice_len_max(DATA_SEGMENT_SIZE_IX) + ) + @staticmethod def preprocess( edge_program: ExportedProgram, @@ -67,6 +92,7 @@ def preprocess( output_ids=[], constant_ids=[], graph_type=OpType.mps_graph, + constant_segment=DataSegment(0, 0), ) convert_model_to_fp16 = True @@ -100,10 +126,44 @@ def preprocess( else: op_handler[node.op](edge_program, node_visitors, node, mps_graph) + segment_data, mps_graph = _extract_constant_segment(mps_graph) + + # Add to aggregate segments cord with padding. 
+ padding_length = _padding_required(len(segment_data), 16) + if padding_length > 0: + segment_data.append(b"\x00" * padding_length) + + # Combine mps_graph with segment data + combined = Cord() + graph_bytes = convert_to_flatbuffer(mps_graph) + + data_segment_offset: int = MPSBackend.EXPECTED_LENGTH + data_segment_offset = data_segment_offset + len(graph_bytes) + + graph_padding_length = _padding_required(data_segment_offset, 16) + data_segment_offset = data_segment_offset + graph_padding_length + data_segment_size = len(segment_data) + + data: bytes = ( + b"\x00\x00\x00\x00" + + MPSBackend.EXPECTED_MAGIC + + data_segment_offset.to_bytes(8, byteorder="little") + + data_segment_size.to_bytes(8, byteorder="little") + ) + assert len(data) == MPSBackend.EXPECTED_LENGTH + + combined.append(data) + combined.append(graph_bytes) + + if graph_padding_length > 0: + combined.append(b"\x00" * graph_padding_length) + # Append the segment data to the end of the mps graph + combined.append(segment_data) + if logging.DEBUG >= logging.root.level: pretty_print(mps_graph) - return PreprocessResult(processed_bytes=convert_to_flatbuffer(mps_graph)) + return PreprocessResult(processed_bytes=bytes(combined)) @staticmethod def handle_call_function( @@ -164,12 +224,42 @@ def handle_get_attr( pass +def _padding_required(offset: int, alignment: int) -> int: + """Returns the padding required to align `offset` to `alignment`.""" + remainder: int = offset % alignment + if remainder != 0: + return alignment - remainder + return 0 + + +def _extract_constant_segment(mps_graph: MPSGraph) -> Tuple[Cord, MPSGraph]: + """Extracts the constant segment from the MPSGraph and returns the updated MPSGraph along with the segment data.""" + # Note that the beginning of the segment data is not aligned. Need to handle out of this call. 
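The preprocess changes above pack a small fixed wrapper in front of the flatbuffer: four zero bytes, the 4-byte "MP00" magic at offset 4, then two little-endian uint64 fields for the constant-segment offset and size, with the graph bytes padded out to a 16-byte boundary before the segment data. A minimal sketch of that layout in plain Python follows; `pack_mps_header` and `_pad_to` are illustrative names only, not part of the patch.

```python
# Illustrative only: mirrors the layout built in MPSBackend.preprocess above.
def _pad_to(offset: int, alignment: int = 16) -> int:
    """Bytes of zero padding needed to align `offset` to `alignment`."""
    remainder = offset % alignment
    return alignment - remainder if remainder else 0


def pack_mps_header(graph_bytes: bytes, segment_size: int) -> bytes:
    header_len = 4 + 4 + 8 + 8                       # zeros + magic + offset + size = 24 bytes
    segment_offset = header_len + len(graph_bytes)   # segment starts after the flatbuffer,
    segment_offset += _pad_to(segment_offset)        # rounded up to a 16-byte boundary
    return (
        b"\x00\x00\x00\x00"                          # first 4 bytes are zero; the magic sits at
        + b"MP00"                                    # offset 4, where a flatbuffer identifier would be
        + segment_offset.to_bytes(8, "little")
        + segment_size.to_bytes(8, "little")
    )


assert len(pack_mps_header(b"", 0)) == 24            # matches MPSBackend.EXPECTED_LENGTH
```

On the runtime side, `MPSDelegateHeader::Parse` further down reads the same two uint64 fields back at byte offsets 8 and 16 through `getUInt64LE`.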
+ segment_data = Cord() + offset = 0 + for i in range(len(mps_graph.mps_values)): + tensor = mps_graph.mps_values[i] + if tensor.constant_buffer_size > 0: + # Notice that buffer is already force aligned so we don't need to pad it + segment_data.append(tensor.constant_buffer.storage) + + # Reset buffer to empty + tensor.constant_buffer = Buffer(storage=b"") + # Update segment offset + tensor.segment_offset = offset + offset += tensor.constant_buffer_size + + return segment_data, mps_graph + + def tensor_to_str(mps_tensor: MPSTensor): tensor_str = "MPSTensor(" tensor_str += "datatype=" + str(mps_tensor.datatype) + ", " tensor_str += "num_dims=" + str(mps_tensor.num_dims) + ", " tensor_str += "dims=" + str(mps_tensor.dims) + ", " - tensor_str += "constant_buffer_size=" + str(mps_tensor.constant_buffer_size) + tensor_str += "constant_buffer=" + str(mps_tensor.constant_buffer) + ", " + tensor_str += "constant_buffer_size=" + str(mps_tensor.constant_buffer_size) + ", " + tensor_str += "segment_offset=" + str(mps_tensor.segment_offset) tensor_str += ")" return tensor_str @@ -193,3 +283,4 @@ def pretty_print(mps_graph: MPSGraph): logging.info(" Output ids:") for out_id in mps_graph.output_ids: logging.info(f" {out_id}") + logging.info(f" Constant segment: {mps_graph.constant_segment}") diff --git a/backends/apple/mps/runtime/MPSCompiler.mm b/backends/apple/mps/runtime/MPSCompiler.mm index 560c1bb0a1..9ac154d5d8 100644 --- a/backends/apple/mps/runtime/MPSCompiler.mm +++ b/backends/apple/mps/runtime/MPSCompiler.mm @@ -43,7 +43,7 @@ Error err = Error::Ok; std::unique_ptr mpsGraphBuilder( - new MPSGraphBuilder(buffer_pointer, executor->_mpsGraphTensorToId)); + new MPSGraphBuilder(buffer_pointer, num_bytes, executor->_mpsGraphTensorToId)); err = mpsGraphBuilder->compileModel(); ET_CHECK_OR_RETURN_ERROR( err == Error::Ok, Internal, "Failed to construct the MPS graph object"); diff --git a/backends/apple/mps/runtime/MPSDelegateHeader.h b/backends/apple/mps/runtime/MPSDelegateHeader.h new file mode 100644 index 0000000000..07a138b918 --- /dev/null +++ b/backends/apple/mps/runtime/MPSDelegateHeader.h @@ -0,0 +1,113 @@ +// +// Copyright (c) 2024 Apple Inc. All rights reserved. +// Provided subject to the LICENSE file in the top level directory. +// + +#pragma once + +#include + +namespace torch { +namespace executor { +namespace mps { +namespace delegate { + +/** + * MPS-header that is embedded before the flatbuffer payload + * + */ +struct MPSDelegateHeader { + /** + * The minimum size of the MPSDelegateHeader. The caller should provide at + * least this many bytes of the head of the serialized MPS Data + */ + static constexpr size_t kMinSize = 30; + + /** + * The magic offset. This offset is the same as the offset for flatbuffer + * header so we will be able to check if the header is is either the + * flatbuffer head or the wrapper header we introduce here + */ + static constexpr size_t kMagicOffset = 4; + + /** + * The magic bytes that identify the header. + * + * This is the canonical definition of the expected value. If the header + * layout ever changes in a compatibility-breaking way, increment the digits + * in the magic. But, doing so will prevent older binaries from recognizing + * the presence of the header. The compatibility-preserving way to make + * changes is to increase the header's length field and add new fields at the + * end. 
+ */ + static constexpr size_t kMagicSize = 4; + static constexpr char kMagic[kMagicSize] = {'M', 'P', '0', '0'}; + + /** + * The size in bytes of the header length. We store 2 bytes for the header + * length + */ + static constexpr size_t kHeaderLengthSize = 2; + + /** + * The expected location of the header length field relative to the beginning + * of the header. + */ + static constexpr size_t kHeaderLengthOffset = + MPSDelegateHeader::kMagicOffset + MPSDelegateHeader::kMagicSize; + + /* + * The expected location of the constant data offset field relative to the + * beginning of the header. + */ + static constexpr size_t kConstantDataSegmentOffset = kHeaderLengthOffset; + + /* + * The expected location of the constant data size field relative to the + * beginning of the header. + */ + static constexpr size_t kConstantDataSizeOffset = + kConstantDataSegmentOffset + sizeof(uint64_t); + + /** + * The expected location of the flatbuffer data offset field relative to the + * beginning of the header. + */ + static constexpr size_t kFlatbufferDataOffsetOffset = + kConstantDataSizeOffset + sizeof(uint64_t); + + /** + * Look for and parse an ExtendedHeader in the provided data. + * + * @param[in] data The contents of the beginning of the serialized binary + * Program data, starting at offset 0 (i.e., the head of the file). + * @param[in] size Length of `data` in bytes. + * + * @returns an MPSHeader if the header was found and is valid. Returns an + * error if size was too short, if the header was not found, or if the + * header appeared to be corrupt. + */ + static Result Parse(const void* data, size_t size); + + /** + * The offset in bytes to the beginning of the constant data. + */ + uint64_t constant_data_offset; + /** + * The size in bytes of the constant data. + */ + uint64_t constant_data_size; + /** + * The offset in bytes to the beginning of the flatbuffer data. + */ + uint64_t flatbuffer_offset; + /** + * The size in bytes of the flatbuffer data. + */ + uint64_t flatbuffer_size; +}; + +} // namespace delegate +} // namespace mps +} // namespace executor +} // namespace torch diff --git a/backends/apple/mps/runtime/MPSDelegateHeader.mm b/backends/apple/mps/runtime/MPSDelegateHeader.mm new file mode 100644 index 0000000000..2994b30507 --- /dev/null +++ b/backends/apple/mps/runtime/MPSDelegateHeader.mm @@ -0,0 +1,53 @@ +// +// Copyright (c) 2024 Apple Inc. All rights reserved. +// Provided subject to the LICENSE file in the top level directory. +// + +#include + +#include + +#include +#include + +namespace torch { +namespace executor { +namespace mps { +namespace delegate { + +/// Interprets the 8 bytes at `data` as a little-endian uint64_t. 
+uint64_t getUInt64LE(const uint8_t* data) { + return (uint64_t)data[0] | ((uint64_t)data[1] << 8) | + ((uint64_t)data[2] << 16) | ((uint64_t)data[3] << 24) | + ((uint64_t)data[4] << 32) | ((uint64_t)data[5] << 40) | + ((uint64_t)data[6] << 48) | ((uint64_t)data[7] << 56); +} + +Result MPSDelegateHeader::Parse(const void* data, size_t size) { + const uint8_t* header_data = (const uint8_t*)data; + + if (size < MPSDelegateHeader::kMinSize) { + return Error::InvalidArgument; + } + + const uint8_t* magic_start = header_data + MPSDelegateHeader::kMagicOffset; + if (std::memcmp(magic_start, MPSDelegateHeader::kMagic, MPSDelegateHeader::kMagicSize) != 0) { + return Error::NotFound; + } + + uint64_t constant_data_offset = getUInt64LE(header_data + MPSDelegateHeader::kConstantDataSegmentOffset); + uint64_t constant_data_size = getUInt64LE(header_data + MPSDelegateHeader::kConstantDataSizeOffset); + uint64_t flatbuffer_offset = MPSDelegateHeader::kFlatbufferDataOffsetOffset; + uint64_t flatbuffer_size = size - flatbuffer_offset; + + return MPSDelegateHeader{ + constant_data_offset, + constant_data_size, + flatbuffer_offset, + flatbuffer_size}; +} + +} // namespace delegate +} // namespace mps +} // namespace executor +} // namespace torch diff --git a/backends/apple/mps/runtime/MPSGraphBuilder.h b/backends/apple/mps/runtime/MPSGraphBuilder.h index 29b9471ae9..5ee1e32aa1 100644 --- a/backends/apple/mps/runtime/MPSGraphBuilder.h +++ b/backends/apple/mps/runtime/MPSGraphBuilder.h @@ -40,7 +40,8 @@ using NodePtr = const mpsgraph::MPSNode *; */ class MPSGraphBuilder { public: - MPSGraphBuilder(const void *buffer_pointer, std::unordered_map &mpsGraphTensorToId); + MPSGraphBuilder(const void *buffer_pointer, size_t num_bytes, + std::unordered_map &mpsGraphTensorToId); ~MPSGraphBuilder() = default; Error compileModel(); @@ -178,12 +179,15 @@ class MPSGraphBuilder { const mpsgraph::MPSGraph *_flatBufferGraph; // FlatBuffer raw bytes of the serialized MPS model. 
const void *_buffer_pointer; + size_t _num_bytes; bool _metal_kernel; MPSGraph *_mpsGraph; MPSGraphExecutable *_mpsGraphExecutable; NSMutableDictionary *_feeds; NSMutableArray *_targetTensors; + + const uint8_t *_constant_data_ptr; }; #undef _DEFINE_MPS_OP diff --git a/backends/apple/mps/runtime/MPSGraphBuilder.mm b/backends/apple/mps/runtime/MPSGraphBuilder.mm index 8b571001d4..a11cb638fb 100644 --- a/backends/apple/mps/runtime/MPSGraphBuilder.mm +++ b/backends/apple/mps/runtime/MPSGraphBuilder.mm @@ -5,13 +5,19 @@ #include #include +#include namespace torch { namespace executor { namespace mps { namespace delegate { -MPSGraphBuilder::MPSGraphBuilder(const void* buffer_pointer, std::unordered_map& mpsGraphTensorToId) : _mpsGraphTensorToId(mpsGraphTensorToId), _buffer_pointer(buffer_pointer) { +MPSGraphBuilder::MPSGraphBuilder( + const void* buffer_pointer, + size_t num_bytes, + std::unordered_map& mpsGraphTensorToId) : + _mpsGraphTensorToId(mpsGraphTensorToId), _buffer_pointer(buffer_pointer), _num_bytes(num_bytes) { + _mpsGraph = [MPSGraph new]; _feeds = [NSMutableDictionary dictionary]; _targetTensors = [NSMutableArray new]; @@ -24,15 +30,36 @@ MPSGraphBuilder::compileModel() { Error err = Error::Ok; - ET_CHECK(_buffer_pointer != nullptr); + Result header = MPSDelegateHeader::Parse(_buffer_pointer, _num_bytes); + const uint8_t* flatbuffer_data_ptr = nullptr; + + if (header.ok()) { + flatbuffer_data_ptr = reinterpret_cast(_buffer_pointer) + + header->flatbuffer_offset; + _constant_data_ptr = reinterpret_cast(_buffer_pointer) + + header->constant_data_offset; + } else if (header.error() == Error::NotFound) { + ET_LOG( + Error, + "MPSDelegateHeader version mismatch: '%.4s' != expected '%.4s'", + // Header Magic and FlatbufferIdentifier are same offset and size + flatbuffers::GetBufferIdentifier(_buffer_pointer), + MPSDelegateHeader::kMagic); + return header.error(); + } else { + ET_LOG(Error, "MPSDelegateHeader may be corrupt"); + return header.error(); + } + + ET_CHECK(flatbuffer_data_ptr != nullptr); ET_CHECK_OR_RETURN_ERROR( - mpsgraph::MPSGraphBufferHasIdentifier(_buffer_pointer), + mpsgraph::MPSGraphBufferHasIdentifier(flatbuffer_data_ptr), DelegateInvalidCompatibility, "MPS Delegate Serialization Format version identifier '%.4s' != expected '%.4s'", - flatbuffers::GetBufferIdentifier(_buffer_pointer), + flatbuffers::GetBufferIdentifier(flatbuffer_data_ptr), mpsgraph::MPSGraphIdentifier()); - _flatBufferGraph = mpsgraph::GetMPSGraph(_buffer_pointer); + _flatBufferGraph = mpsgraph::GetMPSGraph(flatbuffer_data_ptr); switch (_flatBufferGraph->graph_type()) { case mpsgraph::OpType::metal_kernel: { diff --git a/backends/apple/mps/runtime/operations/OperationUtils.mm b/backends/apple/mps/runtime/operations/OperationUtils.mm index 21c4a0d3e7..27e4f170bd 100644 --- a/backends/apple/mps/runtime/operations/OperationUtils.mm +++ b/backends/apple/mps/runtime/operations/OperationUtils.mm @@ -88,10 +88,11 @@ NSData* MPSGraphBuilder::getConstantData(int32_t id) { TensorPtr mpsTensor = _flatBufferGraph->mps_values()->Get(id); - int32_t constantBufferSize = mpsTensor->constant_buffer_size(); - const unsigned char* constantBuffer = mpsTensor->constant_buffer()->storage()->data(); + uint64_t constantBufferSize = mpsTensor->constant_buffer_size(); + uint64_t segmentOffset = mpsTensor->segment_offset(); + const unsigned char* constantBuffer = _constant_data_ptr + segmentOffset; ET_CHECK_MSG(constantBufferSize > 0 && constantBuffer != nullptr, "[ERROR] Invalid constant buffer"); - return [[NSData alloc] 
initWithBytes:constantBuffer + return [[NSData alloc] initWithBytesNoCopy:(void*)constantBuffer length:constantBufferSize]; } diff --git a/backends/apple/mps/serialization/mps_graph_schema.py b/backends/apple/mps/serialization/mps_graph_schema.py index 6909926e8c..c6eddd80ab 100644 --- a/backends/apple/mps/serialization/mps_graph_schema.py +++ b/backends/apple/mps/serialization/mps_graph_schema.py @@ -763,7 +763,14 @@ class MPSTensor: num_dims: int dims: List[int] constant_buffer_size: int - constant_buffer: Buffer + constant_buffer: Buffer # deprecated + segment_offset: int = 0 + + +@dataclass +class DataSegment: + offset: int + size: int @dataclass @@ -775,3 +782,4 @@ class MPSGraph: output_ids: List[int] constant_ids: List[int] graph_type: OpType + constant_segment: DataSegment diff --git a/backends/apple/mps/serialization/schema.fbs b/backends/apple/mps/serialization/schema.fbs index 6e089d4526..cc767178ae 100644 --- a/backends/apple/mps/serialization/schema.fbs +++ b/backends/apple/mps/serialization/schema.fbs @@ -450,6 +450,7 @@ table MPSNode { // taken from executorch // Data buffer abstraction. +// Deprecated table Buffer { storage:[ubyte] (force_align: 16); } @@ -458,8 +459,21 @@ table MPSTensor { datatype:MPSDataType; num_dims:int; dims:[int]; - constant_buffer_size:int; - constant_buffer:Buffer; + constant_buffer_size:uint64; + constant_buffer:Buffer; // deprecated + segment_offset:uint64; +} + +table DataSegment { + // Segment offsets are relative to the segment base offset provided in + // the extended file header. Segments will typically be aligned in a + // way to make it possible to use mmap() to load them. + offset: uint64; + + // The size in bytes of valid data starting at the offset. The segment + // data may be followed by padding before the segment that follows it, + // to make it easier to use mmap(). 
+ size: uint64; } table MPSGraph { @@ -473,6 +487,8 @@ table MPSGraph { constant_ids:[int]; graph_type:OpType; + + constant_segment:DataSegment; } root_type MPSGraph; diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py index 56dac5d248..353cb76c11 100644 --- a/backends/arm/arm_partitioner.py +++ b/backends/arm/arm_partitioner.py @@ -38,6 +38,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: supported = node.op == "call_function" and node.target in [ exir_ops.edge.aten.add.Tensor, exir_ops.edge.aten.addmm.default, + exir_ops.edge.aten.expand_copy.default, exir_ops.edge.aten.permute_copy.default, exir_ops.edge.aten.hardtanh.default, exir_ops.edge.aten.convolution.default, @@ -46,6 +47,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten._native_batch_norm_legit_no_training.default, exir_ops.edge.aten.avg_pool2d.default, exir_ops.edge.aten.sigmoid.default, + exir_ops.edge.aten.repeat.default, exir_ops.edge.aten._softmax.default, exir_ops.edge.aten.slice_copy.Tensor, exir_ops.edge.aten.sub.Tensor, diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index e868b584cf..266f1720a9 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -18,6 +18,7 @@ op_mean_dim, op_permute, op_quant, + op_repeat, op_sigmoid, op_slice, op_softmax, diff --git a/backends/arm/operators/op_addmm.py b/backends/arm/operators/op_addmm.py index 444799d353..4a0581376c 100644 --- a/backends/arm/operators/op_addmm.py +++ b/backends/arm/operators/op_addmm.py @@ -12,10 +12,7 @@ register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from executorch.backends.arm.tosa_quant_utils import ( - compute_multiplier_and_shift, - get_quant_node_args, -) +from executorch.backends.arm.tosa_quant_utils import build_rescale, get_quant_node_args from executorch.backends.arm.tosa_utils import build_reshape from executorch.exir.dialects._ops import ops as exir_ops @@ -128,32 +125,20 @@ def define_node( weight_scale = get_quant_node_args(weight_node_q_node).scale output_rescale_scale = (input_scale * weight_scale) / consumer_node_scale - ( - multiplier_output, - shift_output, - ) = compute_multiplier_and_shift(output_rescale_scale) - - attr_rescale_output = ts.TosaSerializerAttribute() - attr_rescale_output.RescaleAttribute( - input_zp=0, - output_zp=consumer_node_node_zp, - multiplier=[multiplier_output], - shift=[shift_output], - scale32=True, - double_round=True, - per_channel=False, - input_unsigned=False, - output_unsigned=False, - ) reshaped_res = tosa_graph.addIntermediate(result_shape, ts.DType.INT32) build_reshape(tosa_graph, conv2d_res.name, result_shape, reshaped_res.name) - tosa_graph.addOperator( - TosaOp.Op().RESCALE, - [reshaped_res.name], - [output.name], - attr_rescale_output, + build_rescale( + tosa_fb=tosa_graph, + scale=output_rescale_scale, + input_node=reshaped_res, + output_name=output.name, + output_type=ts.DType.INT8, + output_shape=reshaped_res.shape, + input_zp=0, + output_zp=consumer_node_node_zp, + is_double_round=False, ) else: diff --git a/backends/arm/operators/op_repeat.py b/backends/arm/operators/op_repeat.py new file mode 100644 index 0000000000..261fcca12e --- /dev/null +++ b/backends/arm/operators/op_repeat.py @@ -0,0 +1,66 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import serializer.tosa_serializer as ts +import torch +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_utils import tosa_shape +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class RepeatVisitor(NodeVisitor): + target = "aten.repeat.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: list[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + item_name = inputs[0].name + shape = inputs[0].shape + rank = len(shape) + multiples = inputs[1].special + new_rank = len(multiples) + + assert new_rank >= rank + + # TILE only supports rank(in) == rank(out). To add more dims, we need a reshape first. + if new_rank > rank: + # Add length 1 dimensions to shape to match multiples + num_new_dims = new_rank - rank + expanded_shape = tuple( + 1 if i < num_new_dims else shape[i - num_new_dims] + for i in range(new_rank) + ) + expanded_shape = tosa_shape(expanded_shape, output.dim_order) + dtype = ( + ts.dtype_str_to_val("INT8") + if is_quant_node + else ts.dtype_str_to_val("FP32") + ) + + rescale_out = tosa_graph.addIntermediate(expanded_shape, dtype) + rescale_attr = ts.TosaSerializerAttribute() + rescale_attr.ReshapeAttribute(expanded_shape) + tosa_graph.addOperator( + TosaOp.Op().RESHAPE, [item_name], [rescale_out.name], rescale_attr + ) + item_name = rescale_out.name + + attr = ts.TosaSerializerAttribute() + attr.TileAttribute(tosa_shape(multiples, output.dim_order)) + tosa_graph.addOperator(TosaOp.Op().TILE, [item_name], [output.name], attr) diff --git a/backends/arm/passes/arm_pass_manager.py b/backends/arm/passes/arm_pass_manager.py index c2453f701f..123146a325 100644 --- a/backends/arm/passes/arm_pass_manager.py +++ b/backends/arm/passes/arm_pass_manager.py @@ -9,6 +9,9 @@ from executorch.backends.arm.passes.annotate_channels_last_dim_order_pass import ( AnnotateChannelsLastDimOrder, ) +from executorch.backends.arm.passes.convert_expand_copy_to_repeat import ( + ConvertExpandCopyToRepeatPass, +) from executorch.backends.arm.passes.remove_clone_pass import RemoveClonePass from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.pass_manager import PassManager @@ -24,6 +27,7 @@ def transform_to_backend_pipeline( ): """Apply passes before transforming program to backend""" self.add_pass(RemoveClonePass()) + self.add_pass(ConvertExpandCopyToRepeatPass()) for spec in compile_spec: if spec.key == "permute_memory_format": memory_format = spec.value.decode() diff --git a/backends/arm/passes/convert_expand_copy_to_repeat.py b/backends/arm/passes/convert_expand_copy_to_repeat.py new file mode 100644 index 0000000000..53138682d5 --- /dev/null +++ b/backends/arm/passes/convert_expand_copy_to_repeat.py @@ -0,0 +1,59 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch.fx +from executorch.backends.arm.tosa_mapping import extract_tensor_meta +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + + +class ConvertExpandCopyToRepeatPass(ExportPass): + """ + Replace expand copy with repeat since it is a repeat that can only repeat singleton dimensions. + """ + + expand_copy = exir_ops.edge.aten.expand_copy.default + repeat = exir_ops.edge.aten.repeat.default + patterns = [{expand_copy: 1}] + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + partitions = get_source_partitions( + graph, [torch.expand_copy, torch.Tensor.expand, "expand"] + ) + for _, src_partitions in partitions.items(): + for src_partition in src_partitions: + assert len(src_partition.nodes) == 1 + + expand_node = src_partition.nodes[0] + _, shape, _ = extract_tensor_meta(expand_node.all_input_nodes[0].meta) + multiples = expand_node.args[1] + expanded_rank = len(multiples) + + # Expanded shape is 'shape' front-padded with ones. + padding = expanded_rank - len(shape) + extended_shape = [ + shape[i] if i >= 0 else 1 for i in range(-padding, len(shape)) + ] + + # To convert expand arg to repeat arg, non-repeated dims should have + # multiples[dim] = 1. + multiples = [ + multiples[i] if extended_shape[i] == 1 else 1 + for i in range(expanded_rank) + ] + args = (expand_node.args[0], multiples) + + with graph_module.graph.inserting_before(expand_node): + repeat_node = graph.create_node("call_function", self.repeat, args) + repeat_node.meta = expand_node.meta + for user in expand_node.users.copy(): + user.replace_input_with(expand_node, repeat_node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index 397ba68565..7a4c7712ab 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ -385,7 +385,7 @@ def _annotate_io( for node in model.graph.nodes: if arm_quantizer_utils.is_annotated(node): continue - if node.op == "placeholder": + if node.op == "placeholder" and len(node.users) > 0: _annotate_output_qspec( node, quantization_config.get_output_act_qspec(), diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py new file mode 100644 index 0000000000..66c081a544 --- /dev/null +++ b/backends/arm/test/ops/test_expand.py @@ -0,0 +1,109 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
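To make the shape bookkeeping in `ConvertExpandCopyToRepeatPass` above concrete, here is a small standalone sketch of the same computation; `expand_to_repeat_multiples` is a hypothetical helper name, and the pass itself reads the input shape from tensor metadata rather than taking it as an argument.

```python
# Illustrative only: reproduces the multiples computation from
# ConvertExpandCopyToRepeatPass.call above for a concrete case.
def expand_to_repeat_multiples(shape, multiples):
    expanded_rank = len(multiples)
    padding = expanded_rank - len(shape)
    # Front-pad the input shape with ones so it has the expanded rank.
    extended_shape = [shape[i] if i >= 0 else 1 for i in range(-padding, len(shape))]
    # Only singleton dims are actually expanded; every other dim gets multiple 1.
    return [multiples[i] if extended_shape[i] == 1 else 1 for i in range(expanded_rank)]


# Expanding a (3, 1) tensor to (2, 3, 4): only the new leading dim and the
# singleton dim are repeated, so the rewritten repeat() receives (2, 1, 4).
assert expand_to_repeat_multiples((3, 1), (2, 3, 4)) == [2, 1, 4]
```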
+ +# +# Tests the expand op which copies the data of the input tensor (possibly with new data format) +# + +import unittest +from typing import Sequence, Tuple + +import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized + + +class TestSimpleExpand(unittest.TestCase): + """Tests the Tensor.expand which should be converted to a repeat op by a pass.""" + + class Expand(torch.nn.Module): + # (input tensor, multiples) + test_parameters = [ + (torch.ones(1), (2,)), + (torch.ones(1, 4), (1, -1)), + (torch.ones(1, 1, 2, 2), (4, 3, -1, 2)), + (torch.ones(1), (2, 2, 4)), + (torch.ones(3, 2, 4, 1), (-1, -1, -1, 3)), + ] + + def forward(self, x: torch.Tensor, multiples: Sequence): + return x.expand(multiples) + + def _test_expand_tosa_MI_pipeline(self, module: torch.nn.Module, test_data: Tuple): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.expand.default": 1}) + .to_edge() + .partition() + .check_not(["torch.ops.aten.expand.default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_expand_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.expand.default": 1}) + .to_edge() + .partition() + .check_not(["torch.ops.aten.expand.default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_expand_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.expand.default": 1}) + .to_edge() + .partition() + .check_not(["torch.ops.aten.expand.default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Expand.test_parameters) + def test_expand_tosa_MI(self, test_input, multiples): + self._test_expand_tosa_MI_pipeline(self.Expand(), (test_input, multiples)) + + @parameterized.expand(Expand.test_parameters) + def test_expand_tosa_BI(self, test_input, multiples): + self._test_expand_tosa_BI_pipeline(self.Expand(), (test_input, multiples)) + + # Expected failure since tosa.TILE is unsupported by Vela. 
+ @parameterized.expand(Expand.test_parameters) + @unittest.expectedFailure + def test_expand_u55_BI(self, test_input, multiples): + self._test_expand_tosa_u55_pipeline(self.Expand(), (test_input, multiples)) diff --git a/backends/arm/test/ops/test_repeat.py b/backends/arm/test/ops/test_repeat.py new file mode 100644 index 0000000000..a6fad03345 --- /dev/null +++ b/backends/arm/test/ops/test_repeat.py @@ -0,0 +1,110 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# +# Tests the repeat op which copies the data of the input tensor (possibly with new data format) +# + +import unittest +from typing import Sequence, Tuple + +import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized + + +class TestSimpleRepeat(unittest.TestCase): + """Tests Tensor.repeat for different ranks and dimensions.""" + + class Repeat(torch.nn.Module): + # (input tensor, multiples) + test_parameters = [ + (torch.randn(3), (2,)), + (torch.randn(3, 4), (2, 1)), + (torch.randn(1, 1, 2, 2), (1, 2, 3, 4)), + (torch.randn(3), (2, 2)), + (torch.randn(3), (1, 2, 3)), + (torch.randn((3, 3)), (2, 2, 2)), + ] + + def forward(self, x: torch.Tensor, multiples: Sequence): + return x.repeat(multiples) + + def _test_repeat_tosa_MI_pipeline(self, module: torch.nn.Module, test_data: Tuple): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.repeat.default": 1}) + .to_edge() + .partition() + .check_not(["torch.ops.aten.repeat.default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_repeat_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.repeat.default": 1}) + .to_edge() + .partition() + .check_not(["torch.ops.aten.repeat.default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_repeat_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.repeat.default": 1}) + .to_edge() + .partition() + .check_not(["torch.ops.aten.repeat.default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Repeat.test_parameters) + def test_repeat_tosa_MI(self, test_input, multiples): + self._test_repeat_tosa_MI_pipeline(self.Repeat(), (test_input, multiples)) + + 
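The two new test files exercise closely related ops, and the reason the pass can rewrite one as the other is that `expand` only broadcasts singleton dimensions (returning a view) while `repeat` tiles the data into a copy; for singleton dimensions the resulting values coincide. A short PyTorch check of that equivalence, purely for illustration:

```python
import torch

x = torch.arange(4.0).reshape(1, 4)

# expand broadcasts the singleton dim without copying (a view over the same storage);
# repeat materializes a tiled copy. The values are identical, which is what lets
# ConvertExpandCopyToRepeatPass swap one for the other.
expanded = x.expand(3, 4)      # same as x.expand(3, -1)
repeated = x.repeat(3, 1)

assert torch.equal(expanded, repeated)
assert expanded.data_ptr() == x.data_ptr()   # view over the original storage
assert repeated.data_ptr() != x.data_ptr()   # freshly allocated storage
```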
@parameterized.expand(Repeat.test_parameters) + def test_repeat_tosa_BI(self, test_input, multiples): + self._test_repeat_tosa_BI_pipeline(self.Repeat(), (test_input, multiples)) + + # Expected failure since tosa.TILE is unsupported by Vela. + @parameterized.expand(Repeat.test_parameters) + @unittest.expectedFailure + def test_repeat_u55_BI(self, test_input, multiples): + self._test_repeat_tosa_u55_pipeline(self.Repeat(), (test_input, multiples)) diff --git a/backends/arm/tosa_quant_utils.py b/backends/arm/tosa_quant_utils.py index 55649f4bef..c0d16d51b2 100644 --- a/backends/arm/tosa_quant_utils.py +++ b/backends/arm/tosa_quant_utils.py @@ -171,7 +171,7 @@ def build_rescale( output_shape, input_zp, output_zp, - is_double_round, + is_double_round=False, ): scale_width = 32 if is_scale32(output_type) else 16 multiplier, shift = compute_multiplier_and_shift(scale, scale_width) @@ -197,7 +197,7 @@ def build_rescale( def build_rescale_to_int32( - tosa_fb, input, input_zp, rescale_scale, is_scale32=True, is_double_round=True + tosa_fb, input, input_zp, rescale_scale, is_scale32=True, is_double_round=False ) -> TosaSerializerTensor: multiplier, shift = compute_multiplier_and_shift(rescale_scale) attr_rescale = ts.TosaSerializerAttribute() @@ -230,7 +230,7 @@ def build_rescale_from_int32( output_zp, rescale_scale, is_scale32=True, - is_double_round=True, + is_double_round=False, ) -> TosaSerializerTensor: multiplier, shift = compute_multiplier_and_shift(rescale_scale) attr_rescale_output = ts.TosaSerializerAttribute() @@ -329,9 +329,6 @@ def build_rescale_conv_output( output_scale, output_zp, ): - # Only use double round if we are doing 32 bit scaling - double_round = is_scale32(output_type) - # TODO add check to verify if this is a Per-channel quantization. post_conv2d_scale = (input_scale.number * weight_scale.number) / output_scale.number @@ -345,6 +342,5 @@ def build_rescale_conv_output( op.shape, 0, output_zp.number, - double_round, ) return diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index 5ad0192d92..f725655e0d 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -20,6 +20,7 @@ if(NOT EXECUTORCH_ROOT) endif() include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/build/Codegen.cmake) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) @@ -27,8 +28,56 @@ set(TARGET_DIR reference) if(EXECUTORCH_NNLIB_OPT) set(TARGET_DIR hifi) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) endif() +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +if(NOT PYTHON_EXECUTABLE) + resolve_python_executable() +endif() + +set(_common_compile_options -Wno-deprecated-declarations -fPIC) + +# Find prebuilt libraries. executorch package should contain portable_ops_lib, +# etdump, bundled_program. 
+find_package(executorch CONFIG REQUIRED) +target_link_options_shared_lib(executorch) +target_link_options_shared_lib(portable_ops_lib) + +target_include_directories(executorch INTERFACE ${_common_include_directories}) + +find_package( + gflags REQUIRED PATHS ${CMAKE_CURRENT_BINARY_DIR}/../../third-party +) + +add_executable(cadence_runner cadence_runner/cadence_runner.cpp) +target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/operators) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) + +target_include_directories( + etdump INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../sdk/include + ${EXECUTORCH_ROOT}/third-party/flatcc/include +) + +target_include_directories( + cadence_runner PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) + +target_link_libraries( + cadence_runner + executorch + gflags + etdump + extension_data_loader + bundled_program + cadence_ops_lib + flatccrt +) diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index 79646c1293..d077169022 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -4,7 +4,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +load("@fbcode_macros//build_defs:export_files.bzl", "export_file") load("@fbcode_macros//build_defs:python_library.bzl", "python_library") +load( + "@fbsource//tools/build_defs:default_platform_defs.bzl", + "CXX", +) +load("@fbsource//xplat/executorch/codegen:codegen.bzl", "executorch_generated_lib") oncall("odai_jarvis") @@ -53,3 +59,19 @@ python_library( "//executorch/exir/passes:spec_prop_pass", ], ) + +export_file(name = "functions.yaml") + +executorch_generated_lib( + name = "cadence_aot_lib", + custom_ops_yaml_target = "//executorch/kernels/portable:custom_ops.yaml", + functions_yaml_target = ":functions.yaml", + platforms = CXX, + visibility = ["PUBLIC"], + deps = [ + "//executorch/backends/cadence/reference/kernels:cadence_kernels", + "//executorch/backends/cadence/reference/operators:cadence_cpu_ops", + "//executorch/kernels/portable:executorch_all_ops", + "//executorch/kernels/portable:operators", + ], +) diff --git a/backends/cadence/aot/export_example.py b/backends/cadence/aot/export_example.py index c0e1727ec9..f7920f0b8f 100644 --- a/backends/cadence/aot/export_example.py +++ b/backends/cadence/aot/export_example.py @@ -7,17 +7,21 @@ # Example script for exporting simple models to flatbuffer import logging +import tempfile from executorch.backends.cadence.aot.ops_registrations import * # noqa - import os from typing import Any, Tuple from executorch.backends.cadence.aot.compiler import ( + convert_pt2, export_to_cadence, export_to_edge, quantize_pt2, ) +from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer +from executorch.backends.cadence.runtime import runtime +from executorch.backends.cadence.runtime.executor import BundledProgramManager from executorch.exir import ExecutorchProgramManager from torch import nn @@ -44,23 +48,50 @@ def _save_pte_program( logging.error(f"Error while saving to {filename}: {e}") +def _save_bpte_program( + buffer: bytes, + model_name: str, + output_dir: str = "", +) -> None: + if model_name.endswith(".bpte"): + filename = model_name + else: + filename = os.path.join(output_dir, f"{model_name}.bpte") + try: + with open(filename, "wb") as f: + f.write(buffer) + logging.info(f"Saved exported program to 
{filename}") + except Exception as e: + logging.error(f"Error while saving to {output_dir}: {e}") + + def export_model( model: nn.Module, example_inputs: Tuple[Any, ...], file_name: str = "CadenceDemoModel", ): + # create work directory for outputs and model binary + working_dir = tempfile.mkdtemp(dir="/tmp") + logging.debug(f"Created work directory {working_dir}") + + # convert the model (also called in quantize_pt2) + converted_model = convert_pt2(model, example_inputs, CadenceQuantizer()) + + # Get reference outputs from quantized_model + ref_outputs = converted_model(*example_inputs) + # Quantize the model quantized_model = quantize_pt2(model, example_inputs) - # Get edge program + # Get edge program (also called in export_to_cadence) edge_prog_manager = export_to_edge(quantized_model, example_inputs) # Get edge program after Cadence specific passes cadence_prog_manager = export_to_cadence(quantized_model, example_inputs) - exec_prog = cadence_prog_manager.to_executorch() + exec_prog: ExecutorchProgramManager = cadence_prog_manager.to_executorch() - logging.info("Final exported graph:") + logging.info("Final exported graph:\n") exec_prog.exported_program().graph_module.graph.print_tabular() # Print some information to terminal @@ -69,5 +100,28 @@ def export_model( cadence_prog_manager.exported_program().graph_module, ) - # Save the program as (default name is CadenceDemoModel.pte) - _save_pte_program(exec_prog, file_name) + forward_test_data = BundledProgramManager.bundled_program_test_data_gen( + method="forward", inputs=example_inputs, expected_outputs=ref_outputs + ) + bundled_program_manager = BundledProgramManager([forward_test_data]) + buffer = bundled_program_manager._serialize( + exec_prog, + bundled_program_manager.get_method_test_suites(), + forward_test_data, + ) + # Save the program as pte (default name is CadenceDemoModel.pte) + _save_pte_program(exec_prog, file_name, working_dir) + # Save the program as btpe (default name is CadenceDemoModel.bpte) + _save_bpte_program(buffer, file_name, working_dir) + + logging.debug( + f"Executorch bundled program buffer saved to {file_name} is {len(buffer)} total bytes" + ) + + # TODO: move to test infra + runtime.run_and_compare( + executorch_prog=exec_prog, + inputs=example_inputs, + ref_outputs=ref_outputs, + working_dir=working_dir, + ) diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index b31bb20549..e42c818dc1 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -150,7 +150,7 @@ - arg_meta: null kernel_name: impl::reference::quantized_relu_out -func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null kernel_name: impl::reference::quantized_matmul_out diff --git a/backends/cadence/build_cadence_runner.sh b/backends/cadence/build_cadence_runner.sh new file mode 100755 index 0000000000..51f363f8de --- /dev/null +++ b/backends/cadence/build_cadence_runner.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Builds cadence_runner and prints its path. + +set -euo pipefail + +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +readonly SCRIPT_DIR + +readonly EXECUTORCH_ROOT="${SCRIPT_DIR}/../.." + +# Allow overriding the number of build jobs. Default to 9. +export CMAKE_BUILD_PARALLEL_LEVEL="${CMAKE_BUILD_PARALLEL_LEVEL:-9}" + +main() { + cd "${EXECUTORCH_ROOT}" + + rm -rf cmake-out + cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_HOST_TARGETS=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CPUINFO=OFF \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -Bcmake-out . + cmake --build cmake-out --target install --config Release + + local example_dir=backends/cadence + local build_dir="cmake-out/${example_dir}" + local cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" + rm -rf ${build_dir} + cmake -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \ + -DCMAKE_BUILD_TYPE=Release \ + -B"${build_dir}" \ + "${example_dir}" + cmake --build "${build_dir}" --config Release + + local runner="${PWD}/${build_dir}/cadence_runner" + if [[ ! -f "${runner}" ]]; then + echo "ERROR: Failed to build ${build_dir}/cadence_runner" >&2 + exit 1 + else + echo "Built ${build_dir}/cadence_runner" + fi +} + +main "$@" diff --git a/backends/cadence/cadence_runner/TARGETS b/backends/cadence/cadence_runner/TARGETS new file mode 100644 index 0000000000..21f36a9bae --- /dev/null +++ b/backends/cadence/cadence_runner/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/cadence_runner/cadence_runner.cpp b/backends/cadence/cadence_runner/cadence_runner.cpp new file mode 100644 index 0000000000..d76ba004aa --- /dev/null +++ b/backends/cadence/cadence_runner/cadence_runner.cpp @@ -0,0 +1,298 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * @file + * + * This tool can run ExecuTorch model files that only use operators that + * are covered by the portable kernels, with possible delegate to the + * test_backend_compiler_lib. + * + * It sets all input tensor data to ones, and assumes that the outputs are + * all fp32 tensors. 
+ */ + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4MB + +DEFINE_string( + bundled_program_path, + "CadenceDemoModel.bpte", + "Model serialized in flatbuffer format."); + +DEFINE_int32( + testset_idx, + 0, + "Index of bundled verification set to be run " + "by bundled model for verification"); + +DEFINE_string( + etdump_path, + "etdump.etdp", + "If etdump generation is enabled an etdump will be written out to this path"); + +DEFINE_bool( + output_verification, + false, + "Comapre the model output to the reference outputs present in the BundledProgram."); + +DEFINE_bool( + print_output, + false, + "Print the output of the ET model to stdout, if needs."); + +DEFINE_bool(dump_outputs, true, "Dump outputs to etdump file"); + +DEFINE_bool( + dump_intermediate_outputs, + false, + "Dump intermediate outputs to etdump file."); + +DEFINE_string( + debug_output_path, + "debug_output.bin", + "Path to dump debug outputs to."); + +DEFINE_int32( + debug_buffer_size, + 262144, // 256 KB + "Size of the debug buffer in bytes to allocate for intermediate outputs and program outputs logging."); + +using namespace torch::executor; + +std::vector load_file_or_die(const char* path) { + std::ifstream file(path, std::ios::binary | std::ios::ate); + const size_t nbytes = file.tellg(); + file.seekg(0, std::ios::beg); + auto file_data = std::vector(nbytes); + ET_CHECK_MSG( + file.read(reinterpret_cast(file_data.data()), nbytes), + "Could not load contents of file '%s'", + path); + return file_data; +} + +int main(int argc, char** argv) { + runtime_init(); + + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (argc != 1) { + std::string msg = "Extra commandline args:"; + for (int i = 1 /* skip argv[0] (program name) */; i < argc; i++) { + msg += std::string(" ") + argv[i]; + } + ET_LOG(Error, "%s", msg.c_str()); + return 1; + } + + // Read in the entire file. + const char* bundled_program_path = FLAGS_bundled_program_path.c_str(); + std::vector file_data = load_file_or_die(bundled_program_path); + + // Find the offset to the embedded Program. + const void* program_data; + size_t program_data_len; + Error status = torch::executor::bundled_program::GetProgramData( + reinterpret_cast(file_data.data()), + file_data.size(), + &program_data, + &program_data_len); + ET_CHECK_MSG( + status == Error::Ok, + "GetProgramData() failed on file '%s': 0x%x", + bundled_program_path, + (unsigned int)status); + + auto buffer_data_loader = + util::BufferDataLoader(program_data, program_data_len); + + // Parse the program file. This is immutable, and can also be reused + // between multiple execution invocations across multiple threads. + Result program = Program::load(&buffer_data_loader); + if (!program.ok()) { + ET_LOG(Error, "Failed to parse model file %s", bundled_program_path); + return 1; + } + ET_LOG(Info, "Model file %s is loaded.", bundled_program_path); + + // Use the first method in the program. + const char* method_name = nullptr; + { + const auto method_name_result = program->get_method_name(0); + ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); + method_name = *method_name_result; + } + ET_LOG(Info, "Running method %s", method_name); + + // MethodMeta describes the memory requirements of the method. 
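Editorial note: the gflags declared above define the runner's command-line surface. A hedged sketch of driving the built binary from Python, similar in spirit to the `Executor` helper added later in this patch; the binary path and working directory are illustrative assumptions.

```python
# Sketch: invoke the cadence_runner binary produced by build_cadence_runner.sh
# using the flags declared above. Paths are assumptions for illustration.
import subprocess

runner = "./cmake-out/backends/cadence/cadence_runner"  # assumed build output path
work_dir = "/tmp/cadence_demo"                          # assumed working directory

cmd = [
    runner,
    f"--bundled_program_path={work_dir}/CadenceDemoModel.bpte",
    f"--etdump_path={work_dir}/etdump.etdp",
    f"--debug_output_path={work_dir}/debug_output.bin",
    "--output_verification=true",      # compare against bundled reference outputs
    "--dump_intermediate_outputs=false",
]
subprocess.run(cmd, check=True)
```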
+ Result method_meta = program->method_meta(method_name); + ET_CHECK_MSG( + method_meta.ok(), + "Failed to get method_meta for %s: 0x%x", + method_name, + (unsigned int)method_meta.error()); + + // + // The runtime does not use malloc/new; it allocates all memory using the + // MemoryManger provided by the client. Clients are responsible for allocating + // the memory ahead of time, or providing MemoryAllocator subclasses that can + // do it dynamically. + // + + // The method allocator is used to allocate all dynamic C++ metadata/objects + // used to represent the loaded method. This allocator is only used during + // loading a method of the program, which will return an error if there was + // not enough memory. + // + // The amount of memory required depends on the loaded method and the runtime + // code itself. The amount of memory here is usually determined by running the + // method and seeing how much memory is actually used, though it's possible to + // subclass MemoryAllocator so that it calls malloc() under the hood (see + // MallocMemoryAllocator). + // + // In this example we use a statically allocated memory pool. + MemoryAllocator method_allocator{ + MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; + + // The memory-planned buffers will back the mutable tensors used by the + // method. The sizes of these buffers were determined ahead of time during the + // memory-planning pasees. + // + // Each buffer typically corresponds to a different hardware memory bank. Most + // mobile environments will only have a single buffer. Some embedded + // environments may have more than one for, e.g., slow/large DRAM and + // fast/small SRAM, or for memory associated with particular cores. + std::vector> planned_buffers; // Owns the memory + std::vector> planned_spans; // Passed to the allocator + size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); + for (size_t id = 0; id < num_memory_planned_buffers; ++id) { + // .get() will always succeed because id < num_memory_planned_buffers. + size_t buffer_size = + static_cast(method_meta->memory_planned_buffer_size(id).get()); + ET_LOG(Info, "Setting up planned buffer %zu, size %zu.", id, buffer_size); + planned_buffers.push_back(std::make_unique(buffer_size)); + planned_spans.push_back({planned_buffers.back().get(), buffer_size}); + } + HierarchicalAllocator planned_memory( + {planned_spans.data(), planned_spans.size()}); + + // Assemble all of the allocators into the MemoryManager that the Executor + // will use. + MemoryManager memory_manager(&method_allocator, &planned_memory); + + // + // Load the method from the program, using the provided allocators. Running + // the method can mutate the memory-planned buffers, so the method should only + // be used by a single thread at at time, but it can be reused. 
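Editorial note: the comments above describe the memory-planning model (fixed-size pools sized ahead of time, one per hardware memory bank). A toy Python illustration of that idea follows; it is a conceptual sketch only and not the ExecuTorch `MemoryManager`/`HierarchicalAllocator` API.

```python
# Toy illustration of the planning idea described above: each planned buffer
# is a pre-sized pool, and tensors are handed out by bump allocation.
from dataclasses import dataclass, field
from typing import List


@dataclass
class PlannedBuffer:
    size: int
    offset: int = 0

    def allocate(self, nbytes: int) -> int:
        if self.offset + nbytes > self.size:
            raise MemoryError("planned buffer exhausted")
        start = self.offset
        self.offset += nbytes
        return start


@dataclass
class ToyHierarchicalAllocator:
    # Each buffer id typically corresponds to a different memory bank
    # (e.g., slow/large DRAM vs. fast/small SRAM).
    buffers: List[PlannedBuffer] = field(default_factory=list)

    def allocate(self, buffer_id: int, nbytes: int) -> int:
        return self.buffers[buffer_id].allocate(nbytes)


allocator = ToyHierarchicalAllocator([PlannedBuffer(4 * 1024 * 1024)])
offset = allocator.allocate(0, 1024)  # byte offset within planned buffer 0
```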
+ // + torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); + Result method = + program->load_method(method_name, &memory_manager, &etdump_gen); + ET_CHECK_MSG( + method.ok(), + "Loading of method %s failed with status 0x%" PRIx32, + method_name, + method.error()); + ET_LOG(Info, "Method loaded."); + + void* debug_buffer = malloc(FLAGS_debug_buffer_size); + if (FLAGS_dump_intermediate_outputs) { + Span buffer((uint8_t*)debug_buffer, FLAGS_debug_buffer_size); + etdump_gen.set_debug_buffer(buffer); + etdump_gen.set_event_tracer_debug_level( + EventTracerDebugLogLevel::kIntermediateOutputs); + } else if (FLAGS_dump_outputs) { + Span buffer((uint8_t*)debug_buffer, FLAGS_debug_buffer_size); + etdump_gen.set_debug_buffer(buffer); + etdump_gen.set_event_tracer_debug_level( + EventTracerDebugLogLevel::kProgramOutputs); + } + // Use the inputs embedded in the bundled program. + status = torch::executor::bundled_program::LoadBundledInput( + *method, file_data.data(), FLAGS_testset_idx); + ET_CHECK_MSG( + status == Error::Ok, + "LoadBundledInput failed with status 0x%" PRIx32, + status); + + ET_LOG(Info, "Inputs prepared."); + + // Run the model. + status = method->execute(); + ET_CHECK_MSG( + status == Error::Ok, + "Execution of method %s failed with status 0x%" PRIx32, + method_name, + status); + ET_LOG(Info, "Model executed successfully."); + + // Print the outputs. + if (FLAGS_print_output) { + std::vector outputs(method->outputs_size()); + status = method->get_outputs(outputs.data(), outputs.size()); + ET_CHECK(status == Error::Ok); + for (EValue& output : outputs) { + // TODO(T159700776): This assumes that all outputs are fp32 tensors. Add + // support for other EValues and Tensor dtypes, and print tensors in a + // more readable way. + auto output_tensor = output.toTensor(); + auto data_output = output_tensor.const_data_ptr(); + for (size_t j = 0; j < output_tensor.numel(); ++j) { + ET_LOG(Info, "%f", data_output[j]); + } + } + } + + // Dump the etdump data containing profiling/debugging data to the specified + // file. + etdump_result result = etdump_gen.get_etdump_data(); + if (result.buf != nullptr && result.size > 0) { + FILE* f = fopen(FLAGS_etdump_path.c_str(), "w+"); + fwrite((uint8_t*)result.buf, 1, result.size, f); + fclose(f); + free(result.buf); + } + + if (FLAGS_output_verification) { + // Verify the outputs. 
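Editorial note: the etdump and debug buffer written by the runner above can be inspected offline. A sketch using the SDK `Inspector`, mirroring the `JarvisETDump` helper added later in this patch; file names follow the runner's default flags and the working directory is an assumption.

```python
# Sketch: read the profiling/debug artifacts produced by cadence_runner.
import os

from executorch.sdk import Inspector

work_dir = "/tmp/cadence_demo"  # assumed; must match the runner invocation
inspector = Inspector(
    etdump_path=os.path.join(work_dir, "etdump.etdp"),
    debug_buffer_path=os.path.join(work_dir, "debug_output.bin"),
)

# Print profiling data and pull the outputs of the "Execute" event block.
inspector.print_data_tabular()
outputs = [
    block.run_output for block in inspector.event_blocks if block.name == "Execute"
]
print(outputs[0])
```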
+ status = + torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput( + *method, + file_data.data(), + FLAGS_testset_idx, + 1e-3, // rtol + 1e-5 // atol + ); + ET_CHECK_MSG( + status == Error::Ok, + "Bundle verification failed with status 0x%" PRIx32, + status); + ET_LOG(Info, "Model verified successfully."); + } + + if (FLAGS_dump_outputs || FLAGS_dump_intermediate_outputs) { + FILE* f = fopen(FLAGS_debug_output_path.c_str(), "w+"); + fwrite((uint8_t*)debug_buffer, 1, FLAGS_debug_buffer_size, f); + fclose(f); + } + free(debug_buffer); + + return 0; +} diff --git a/backends/cadence/cadence_runner/targets.bzl b/backends/cadence/cadence_runner/targets.bzl new file mode 100644 index 0000000000..028ff7ad2e --- /dev/null +++ b/backends/cadence/cadence_runner/targets.bzl @@ -0,0 +1,30 @@ +load("@fbsource//tools/build_defs:fb_native_wrapper.bzl", "fb_native") +load("@fbsource//tools/build_defs:fb_xplat_cxx_binary.bzl", "fb_xplat_cxx_binary") +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") + +def define_common_targets(): + fb_native.export_file( + name = "cadence_runner.cpp", + src = "cadence_runner.cpp", + visibility = [ + "PUBLIC", + ], + ) + + fb_xplat_cxx_binary( + name = "cadence_runner", + srcs = ["cadence_runner.cpp"], + headers = [], + platforms = CXX, + visibility = ["PUBLIC"], + deps = [ + "fbsource//arvr/third-party/gflags:gflags", + "fbsource//xplat/executorch/kernels/portable:generated_lib", + "fbsource//xplat/executorch/runtime/executor:program", + "fbsource//xplat/executorch/extension/data_loader:file_data_loader", + "fbsource//xplat/executorch/extension/data_loader:buffer_data_loader", + "fbsource//xplat/executorch/util:util", + "fbsource//xplat/executorch/sdk/etdump:etdump_flatcc", + "fbsource//xplat/executorch/sdk/bundled_program:runtime", + ], + ) diff --git a/backends/cadence/hifi/kernels/kernels.cpp b/backends/cadence/hifi/kernels/kernels.cpp index 47a5c1cfc0..5a2d58d2e2 100644 --- a/backends/cadence/hifi/kernels/kernels.cpp +++ b/backends/cadence/hifi/kernels/kernels.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include "kernels.h" +#include #include "xa_nnlib_common.h" #include "xa_nnlib_common_macros.h" diff --git a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp b/backends/cadence/hifi/operators/dequantize_per_tensor.cpp index 8a296307ee..0067f6510d 100644 --- a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp +++ b/backends/cadence/hifi/operators/dequantize_per_tensor.cpp @@ -6,8 +6,8 @@ * LICENSE file in the root directory of this source tree. */ +#include #include -#include "kernels.h" namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/operators/quantize_per_tensor.cpp b/backends/cadence/hifi/operators/quantize_per_tensor.cpp index aea6c1b943..bc0d315f3d 100644 --- a/backends/cadence/hifi/operators/quantize_per_tensor.cpp +++ b/backends/cadence/hifi/operators/quantize_per_tensor.cpp @@ -6,8 +6,8 @@ * LICENSE file in the root directory of this source tree. */ +#include #include -#include "kernels.h" namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/operators/quantized_layer_norm.cpp b/backends/cadence/hifi/operators/quantized_layer_norm.cpp index 27d86e5622..034e5b2884 100644 --- a/backends/cadence/hifi/operators/quantized_layer_norm.cpp +++ b/backends/cadence/hifi/operators/quantized_layer_norm.cpp @@ -6,8 +6,8 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include -#include "kernels.h" #include #include diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/quantized_linear_out.cpp index 2fdd900008..ddba4df17c 100644 --- a/backends/cadence/hifi/operators/quantized_linear_out.cpp +++ b/backends/cadence/hifi/operators/quantized_linear_out.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include "kernels.h" +#include #include #include diff --git a/backends/cadence/reference/kernels/TARGETS b/backends/cadence/reference/kernels/TARGETS new file mode 100644 index 0000000000..67f2bab681 --- /dev/null +++ b/backends/cadence/reference/kernels/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/reference/kernels/kernels.cpp b/backends/cadence/reference/kernels/kernels.cpp index 735d390bc7..ae3e1bad2d 100644 --- a/backends/cadence/reference/kernels/kernels.cpp +++ b/backends/cadence/reference/kernels/kernels.cpp @@ -6,10 +6,10 @@ * LICENSE file in the root directory of this source tree. */ -#include "kernels.h" - +#include #include -#include +#include +#include namespace impl { namespace reference { diff --git a/backends/cadence/reference/kernels/targets.bzl b/backends/cadence/reference/kernels/targets.bzl new file mode 100644 index 0000000000..d3fe3fa39d --- /dev/null +++ b/backends/cadence/reference/kernels/targets.bzl @@ -0,0 +1,15 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.cxx_library( + name = "cadence_kernels", + srcs = ["kernels.cpp"], + exported_headers = [ + "kernels.h", + ], + visibility = [ + "//executorch/backends/cadence/...", + ], + platforms = CXX, + ) diff --git a/backends/cadence/reference/operators/TARGETS b/backends/cadence/reference/operators/TARGETS new file mode 100644 index 0000000000..67f2bab681 --- /dev/null +++ b/backends/cadence/reference/operators/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/reference/operators/dequantize_per_tensor.cpp b/backends/cadence/reference/operators/dequantize_per_tensor.cpp index 4d6a618034..29323ce612 100644 --- a/backends/cadence/reference/operators/dequantize_per_tensor.cpp +++ b/backends/cadence/reference/operators/dequantize_per_tensor.cpp @@ -6,8 +6,8 @@ * LICENSE file in the root directory of this source tree. */ +#include #include -#include "kernels.h" namespace impl { namespace reference { diff --git a/backends/cadence/reference/operators/op_add.cpp b/backends/cadence/reference/operators/op_add.cpp index 946a1ee858..3a8a388717 100644 --- a/backends/cadence/reference/operators/op_add.cpp +++ b/backends/cadence/reference/operators/op_add.cpp @@ -8,7 +8,6 @@ #include #include -#include #include #include diff --git a/backends/cadence/reference/operators/quantize_per_tensor.cpp b/backends/cadence/reference/operators/quantize_per_tensor.cpp index 8e25b58a07..c2e53cda88 100644 --- a/backends/cadence/reference/operators/quantize_per_tensor.cpp +++ b/backends/cadence/reference/operators/quantize_per_tensor.cpp @@ -6,8 +6,8 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include -#include "kernels.h" namespace impl { namespace reference { diff --git a/backends/cadence/reference/operators/quantized_conv_out.cpp b/backends/cadence/reference/operators/quantized_conv_out.cpp index 95236b4397..4bb7b12a88 100644 --- a/backends/cadence/reference/operators/quantized_conv_out.cpp +++ b/backends/cadence/reference/operators/quantized_conv_out.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include "kernels.h" +#include #include #include diff --git a/backends/cadence/reference/operators/quantized_layer_norm.cpp b/backends/cadence/reference/operators/quantized_layer_norm.cpp index 22075f632e..6588748d2d 100644 --- a/backends/cadence/reference/operators/quantized_layer_norm.cpp +++ b/backends/cadence/reference/operators/quantized_layer_norm.cpp @@ -6,8 +6,8 @@ * LICENSE file in the root directory of this source tree. */ +#include #include -#include "kernels.h" #include #include @@ -25,7 +25,7 @@ namespace native { template void quantized_layer_norm_( const Tensor& input, - float input_scale, + double input_scale, int64_t input_zero_point, const Tensor& weight, const Tensor& bias, @@ -39,7 +39,7 @@ void quantized_layer_norm_( const float* __restrict__ weight_data = weight.const_data_ptr(); const float* __restrict__ bias_data = bias.const_data_ptr(); - float output_inv_scale = XT_RECIP_S(output_scale); + float output_inv_scale = 1.0f / output_scale; size_t last_dim = input.size(input.dim() - 1); size_t leading_dims = getLeadingDims(input, input.dim() - 1); @@ -47,15 +47,14 @@ void quantized_layer_norm_( // Visualize the input tensor as a set of 1d vectors, and compute the // layer_norm for each vector. for (size_t i = 0; i < leading_dims; ++i) { - const T* __restrict__ x = in_data + i * last_dim; - T* __restrict__ y = out_data + i * last_dim; + const T* x = in_data + i * last_dim; + T* y = out_data + i * last_dim; // compute sum and squared sum. The fp32 sum can be approximated as: // (X_1 - in_zero_point) * in_scale + (X_2 - in_zero_point) * in_scale + ... // (X_N - in_zero_point) * in_scale. int32_t sum = 0; int32_t sq_sum = last_dim * input_zero_point * input_zero_point; -#pragma simd for (size_t j = 0; j < last_dim; ++j) { int32_t val = x[j]; sum += val; @@ -64,19 +63,18 @@ void quantized_layer_norm_( sq_sum -= (2 * sum * input_zero_point); sum -= (last_dim * input_zero_point); - float mean = XT_DIV_S(XT_MUL_S(input_scale, sum), last_dim); + float mean = (input_scale * sum) / last_dim; float variance = - XT_DIV_S( - XT_MUL_S(sq_sum, XT_MUL_S(input_scale, input_scale)), last_dim) - - XT_MUL_S(mean, mean); - float inv_std = XT_RECIP_S(XT_SQRT_S(XT_ADD_S(variance, (float)eps))); + (sq_sum * input_scale * input_scale) / last_dim - mean * mean; + float inv_std = 1.0f / std::sqrt(variance + eps); // y = (x - mean) / std * kGamma + kBeta -#pragma simd - for (size_t j = 0; j < last_dim; ++j) { + for (int j = 0; j < last_dim; ++j) { + // y[j] = (x[j] - mean) / std * kGamma + kBeta; // Since X is quantized, we dequantize it, compute fp32 result, and // quantize the result to an int8/uint8 value. 
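Editorial note: the reference `quantized_layer_norm_` kernel above now uses plain floating-point math instead of Xtensa intrinsics. A NumPy sketch of the same per-vector computation follows, useful as a readable cross-check; the affine quantize/dequantize helpers are assumed to follow the usual `(x - zero_point) * scale` scheme.

```python
# Sketch: per-vector quantized layer norm, mirroring the reference kernel.
import numpy as np


def quantized_layer_norm_1d(
    x_q: np.ndarray,        # int8/uint8 input values for one vector
    input_scale: float,
    input_zero_point: int,
    weight: np.ndarray,     # fp32 gamma
    bias: np.ndarray,       # fp32 beta
    eps: float,
    output_scale: float,
    output_zero_point: int,
    out_dtype=np.int8,
) -> np.ndarray:
    n = x_q.size
    x = x_q.astype(np.int32)

    # Integer sums with the zero-point correction used by the kernel.
    s = int(x.sum())
    sq = n * input_zero_point * input_zero_point + int((x * x).sum())
    sq -= 2 * s * input_zero_point
    s -= n * input_zero_point

    mean = (input_scale * s) / n
    variance = (sq * input_scale * input_scale) / n - mean * mean
    inv_std = 1.0 / np.sqrt(variance + eps)

    # Dequantize, normalize with gamma/beta, then re-quantize.
    val = (x - input_zero_point) * input_scale
    val = (val - mean) * inv_std * weight + bias
    info = np.iinfo(out_dtype)
    y = np.round(val / output_scale) + output_zero_point
    return np.clip(y, info.min, info.max).astype(out_dtype)
```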
float val = kernels::dequantize(x[j], input_scale, input_zero_point); + val = (val - mean) * inv_std * weight_data[j] + bias_data[j]; y[j] = kernels::quantize(val, output_inv_scale, output_zero_point); } diff --git a/backends/cadence/reference/operators/quantized_linear_out.cpp b/backends/cadence/reference/operators/quantized_linear_out.cpp index fa40f16427..43289b3a28 100644 --- a/backends/cadence/reference/operators/quantized_linear_out.cpp +++ b/backends/cadence/reference/operators/quantized_linear_out.cpp @@ -6,8 +6,8 @@ * LICENSE file in the root directory of this source tree. */ +#include #include -#include "kernels.h" namespace impl { namespace reference { diff --git a/backends/cadence/reference/operators/quantized_matmul_out.cpp b/backends/cadence/reference/operators/quantized_matmul_out.cpp index 49dd222a96..d65175f8f1 100644 --- a/backends/cadence/reference/operators/quantized_matmul_out.cpp +++ b/backends/cadence/reference/operators/quantized_matmul_out.cpp @@ -6,8 +6,8 @@ * LICENSE file in the root directory of this source tree. */ +#include #include -#include "kernels.h" namespace impl { namespace reference { @@ -105,7 +105,6 @@ void inline _typed_quantized_matmul( out_dim); } } - break; } void quantized_matmul_out( @@ -120,7 +119,7 @@ void quantized_matmul_out( int64_t out_zero_point, bool transposed, Tensor& out) { - if (out.scalar_type() == at::ScalarType::Byte) { + if (out.scalar_type() == exec_aten::ScalarType::Byte) { _typed_quantized_matmul( X, X_zero_point, @@ -132,7 +131,7 @@ void quantized_matmul_out( out_zero_point, transposed, out); - } else if (out.scalar_type() == at::ScalarType::Char) { + } else if (out.scalar_type() == exec_aten::ScalarType::Char) { _typed_quantized_matmul( X, X_zero_point, diff --git a/backends/cadence/reference/operators/quantized_relu_out.cpp b/backends/cadence/reference/operators/quantized_relu_out.cpp index bcfd28b5bc..ef1813f65c 100644 --- a/backends/cadence/reference/operators/quantized_relu_out.cpp +++ b/backends/cadence/reference/operators/quantized_relu_out.cpp @@ -6,8 +6,8 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include -#include "kernels.h" namespace impl { namespace reference { diff --git a/backends/cadence/reference/operators/targets.bzl b/backends/cadence/reference/operators/targets.bzl new file mode 100644 index 0000000000..347d476239 --- /dev/null +++ b/backends/cadence/reference/operators/targets.bzl @@ -0,0 +1,20 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.cxx_library( + name = "cadence_cpu_ops", + srcs = glob([ + "*.cpp", + ]), + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:broadcast_util", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/backends/cadence/reference/kernels:cadence_kernels", + ], + visibility = [ + "//executorch/backends/cadence/...", + ], + ) diff --git a/backends/cadence/runtime/TARGETS b/backends/cadence/runtime/TARGETS new file mode 100644 index 0000000000..9f30cadf6f --- /dev/null +++ b/backends/cadence/runtime/TARGETS @@ -0,0 +1,21 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +oncall("odai_jarvis") + +python_library( + name = "runtime", + srcs = [ + "__init__.py", + "executor.py", + ] + glob([ + "xtsc-cfg/**/*", + ]), + typing = True, + deps = [ + "//caffe2:torch", + "//executorch/exir:lib", + "//executorch/sdk/bundled_program:config", + "//executorch/sdk/bundled_program:core", + "//executorch/sdk/bundled_program/serialize:lib", + ], +) diff --git a/backends/cadence/runtime/__init__.py b/backends/cadence/runtime/__init__.py new file mode 100644 index 0000000000..802e218f0d --- /dev/null +++ b/backends/cadence/runtime/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +from .executor import ( # noqa: F401 + BundledProgramManager, + BundledProgramTestData, + Executor, +) diff --git a/backends/cadence/runtime/executor.py b/backends/cadence/runtime/executor.py new file mode 100644 index 0000000000..7bcf705c03 --- /dev/null +++ b/backends/cadence/runtime/executor.py @@ -0,0 +1,202 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + + +import logging +import os +import selectors +import subprocess +import sys + +from dataclasses import dataclass +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import torch + +from executorch.exir import ExecutorchProgram, ExecutorchProgramManager + +from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.sdk.bundled_program.core import BundledProgram + +from executorch.sdk.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) + +# If quiet is true, suppress the printing of stdout and stderr output. +quiet = False + + +def _execute_subprocess(cmd: List[str], cwd: Optional[str] = None) -> Tuple[str, str]: + """ + `subprocess.run(cmd, capture_output=True)` captures stdout/stderr and only + returns it at the end. This functions not only does that, but also prints out + stdout/stderr non-blockingly when running the command. 
+ """ + logging.debug(f"cmd = \33[33m{cmd}\33[0m, cwd = {cwd}") + stdout = "" + stderr = "" + + PIPE = subprocess.PIPE + with subprocess.Popen(cmd, stdout=PIPE, stderr=PIPE, cwd=cwd) as p: + sel = selectors.DefaultSelector() + # pyre-fixme[6]: For 1st argument expected `Union[HasFileno, int]` but got + # `Optional[IO[bytes]]`. + sel.register(p.stdout, selectors.EVENT_READ) + # pyre-fixme[6]: For 1st argument expected `Union[HasFileno, int]` but got + # `Optional[IO[bytes]]`. + sel.register(p.stderr, selectors.EVENT_READ) + + done = False + while not done: + for key, _ in sel.select(): + # pyre-fixme[16]: Item `HasFileno` of `Union[HasFileno, int]` has no + # attribute `read1`. + data = key.fileobj.read1().decode() + if not data: + done = True + break + + if key.fileobj is p.stdout: + if not quiet: + print(data, end="") + stdout += data + else: + if not quiet: + print(data, end="", file=sys.stderr) + stderr += data + + # flush stdout and stderr in case there's no newline character at the end + # from the subprocess + sys.stdout.flush() + sys.stderr.flush() + + if p.returncode != 0: + raise subprocess.CalledProcessError(p.returncode, p.args, stdout, stderr) + + return stdout, stderr + + +def execute(args: List[str]) -> Tuple[str, str]: + """ + Either a local execution (through subprocess.run) or a remote execution (in Hargow). + Run the command described by args (the same way subprocess.run does). Ex: if you want to + run "ls -al", you need to pass args = ["ls", "-al"] + """ + # `import torch` will mess up PYTHONPATH. delete the messed up PYTHONPATH + if "PYTHONPATH" in os.environ: + del os.environ["PYTHONPATH"] + + try: + return _execute_subprocess(args) + except subprocess.CalledProcessError as e: + fdb_cmd = f"fdb {' '.join(e.cmd)}" + raise RuntimeError( + f"Failed to execute. Use the following to debug:\n{fdb_cmd}" + ) from e + + +class Executor: + # pyre-fixme[3]: Return type must be annotated. 
+ def __init__( + self, + working_dir: str = "", + ): + self.working_dir = working_dir + self.executor_builder = "./backends/cadence/build_cadence_runner.sh" + self.execute_runner = "./cmake-out/backends/cadence/cadence_runner" + self.bundled_program_path: str = "CadenceDemoModel.bpte" + + def __call__(self) -> None: + # build executor + args = self.get_bash_command(self.executor_builder) + logging.info(f"\33[33m{' '.join(args)}\33[0m") + execute(args) + + # run executor + cmd_args = { + "bundled_program_path": os.path.join( + self.working_dir, self.bundled_program_path + ), + "etdump_path": os.path.join(self.working_dir, "etdump.etdp"), + "debug_output_path": os.path.join(self.working_dir, "debug_output.bin"), + } + args = self.get_bash_command(self.execute_runner, cmd_args) + logging.info(f"\33[33m{' '.join(args)}\33[0m") + execute(args) + + @staticmethod + def get_bash_command( + executable: str, + cmd_args: Optional[Dict[str, str]] = None, + ) -> List[str]: + # go through buck config and turn the dict into a list of "{key}=={value}" + if cmd_args is None: + cmd_args = {} + + cmd_args_strs = [] + for key, value in cmd_args.items(): + cmd_args_strs.extend([f"--{key}={value}"]) + + return [executable] + cmd_args_strs + + +@dataclass +class BundledProgramTestData: + method: str + inputs: Sequence[Union[bool, float, int, torch.Tensor]] + expected_outputs: Sequence[torch.Tensor] + testset_idx: int = 0 # There is only one testset in the bundled program + + +class BundledProgramManager: + """ + Stateful bundled program object + Takes a BundledProgramTestData and generates a bundled program + """ + + def __init__(self, bundled_program_test_data: List[BundledProgramTestData]) -> None: + self.bundled_program_test_data: List[BundledProgramTestData] = ( + bundled_program_test_data + ) + + @staticmethod + # pyre-fixme[2]: Parameter `**args` has no type specified. + def bundled_program_test_data_gen(**args) -> BundledProgramTestData: + return BundledProgramTestData(**args) + + def get_method_test_suites(self) -> List[MethodTestSuite]: + return [ + self._gen_method_test_suite(bptd) for bptd in self.bundled_program_test_data + ] + + def _gen_method_test_suite(self, bptd: BundledProgramTestData) -> MethodTestSuite: + method_test_case = MethodTestCase( + inputs=bptd.inputs, + expected_outputs=bptd.expected_outputs, + ) + return MethodTestSuite( + method_name=bptd.method, + test_cases=[method_test_case], + ) + + def _serialize( + self, + executorch_program: Union[ + ExecutorchProgram, + ExecutorchProgramManager, + ], + method_test_suites: Sequence[MethodTestSuite], + bptd: BundledProgramTestData, + ) -> bytes: + bundled_program = BundledProgram( + executorch_program=executorch_program, method_test_suites=method_test_suites + ) + bundled_program_buffer = serialize_from_bundled_program_to_flatbuffer( + bundled_program + ) + return bundled_program_buffer diff --git a/backends/cadence/runtime/executor_main.sh b/backends/cadence/runtime/executor_main.sh new file mode 100644 index 0000000000..c850ab8b4a --- /dev/null +++ b/backends/cadence/runtime/executor_main.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Test the end-to-end flow of building sdk_example_runner and use it to run +# an actual model. 
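Editorial note: a short usage sketch for the `Executor` class defined above. It assumes the current working directory is the executorch repo root and that `<working_dir>/CadenceDemoModel.bpte` has already been produced by the export flow.

```python
# Sketch: build and run the cadence_runner against an existing bundled program.
import tempfile

from executorch.backends.cadence.runtime import Executor

working_dir = tempfile.mkdtemp(dir="/tmp")
# ... export a model and write CadenceDemoModel.bpte into working_dir ...

executor = Executor(working_dir)
executor()  # builds cadence_runner via build_cadence_runner.sh, then executes it
```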
+ + +set -e + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/../../.ci/scripts/utils.sh" + +cmake_install_executorch_sdk_lib() { + echo "Installing libexecutorch.a, libportable_kernels.a, libetdump.a, libbundled_program.a" + rm -rf cmake-out + + retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out . + cmake --build cmake-out -j9 --target install --config Release +} + +test_cmake_sdk_example_runner() { + local example_dir=examples/sdk + local build_dir=cmake-out/${example_dir} + CMAKE_PREFIX_PATH="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" + rm -rf ${build_dir} + retry cmake \ + -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ + -DCMAKE_BUILD_TYPE=Release \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -B${build_dir} \ + ${example_dir} + + echo "Building ${example_dir}" + cmake --build ${build_dir} -j9 --config Release + + echo 'Running sdk_example_runner' + ${build_dir}/sdk_example_runner --bundled_program_path="./CadenceDemoModel.bpte" +} + +if [[ -z $PYTHON_EXECUTABLE ]]; +then + PYTHON_EXECUTABLE=python3 +fi + +if [[ -z $BUCK ]]; +then + BUCK=buck2 +fi + +cmake_install_executorch_sdk_lib +test_cmake_sdk_example_runner diff --git a/backends/cadence/runtime/runtime.py b/backends/cadence/runtime/runtime.py new file mode 100644 index 0000000000..ec282f8f7b --- /dev/null +++ b/backends/cadence/runtime/runtime.py @@ -0,0 +1,241 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import logging +import numbers +import os +import tempfile +from typing import Any, Optional, Sequence, Tuple, Union + +import executorch.exir.schema as et_schema + +import numpy as np +import torch + +from executorch.backends.cadence.runtime import utils +from executorch.backends.cadence.runtime.executor import Executor +from executorch.exir import ExecutorchProgramManager +from executorch.exir._serialize._program import deserialize_pte_binary +from executorch.exir.schema import DataLocation +from executorch.sdk import Inspector + +from numpy import ndarray + +from torch.utils._pytree import TreeSpec + + +class JarvisETDump: + def __init__(self, output_dir: str) -> None: + self.tensor_dump_dir: str = os.path.join(output_dir, "tensors") + self.etdump_path: str = os.path.join(output_dir, "etdump.etdp") + self.etrecord_path: Optional[str] = os.path.join(output_dir, "etrecord.bin") + self.debug_buffer_path: Optional[str] = os.path.join( + output_dir, "debug_output.bin" + ) + + if not os.path.exists(self.etdump_path): + raise RuntimeError(f"{self.etdump_path} does not exist") + # pyre-ignore[6]: os.path.exists expects str, but got Optional[str] + if not os.path.exists(self.etrecord_path): + logging.warning( + "ETRecord not found, intermediate tensors will not be dumped" + ) + self.etrecord_path = None + # pyre-ignore[6]: os.path.exists expects str, but got Optional[str] + if not os.path.exists(self.debug_buffer_path): + logging.warning( + "Debug buffer not found, intermediate tensors will not be dumped" + ) + self.debug_buffer_path = None + + self.et_inspector: Inspector = Inspector( + etdump_path=self.etdump_path, + debug_buffer_path=self.debug_buffer_path, + etrecord=self.etrecord_path, + ) + + def get_outputs(self, log_to_stdout: bool = False) -> 
Tuple[torch.Tensor]: + output = [ + event_block.run_output + for event_block in self.et_inspector.event_blocks + if event_block.name == "Execute" + ] + logging.debug(f"[Jarvis][ETdump] output: {output}") + return output[0] + + def print_event_block(self) -> None: + logging.debug("[Jarvis][ETdump] data tabular:") + if logging.getLogger().level <= logging.DEBUG: + self.et_inspector.print_data_tabular() + + def print_event_data(self) -> None: + logging.debug("[Jarvis][ETdump] event data ") + for event_block in self.et_inspector.event_blocks: + for event in event_block.events: + logging.debug(event) + + def dump_intermediate_tensors(self) -> None: + if self.etrecord_path is None: + logging.info("[Jarvis][ETdump] Intermediate tensors not available") + return + + logging.info( + f"[Jarvis][ETdump] Dumping intermediate tensors to {self.tensor_dump_dir}" + ) + os.makedirs(self.tensor_dump_dir, exist_ok=True) + exec_blocks = [ + eb for eb in self.et_inspector.event_blocks if eb.name == "Execute" + ] + if len(exec_blocks) > 1: + logging.warning( + f'Found {len(exec_blocks)} "Execute" blocks, using the first one and ignoring the rest.' + ) + block = exec_blocks[0] + + # OPERATOR_CALL events are duplicates that contain framework tax data. We don't need them + op_events = [e for e in block.events if e.name != "OPERATOR_CALL"] + torch.set_printoptions(profile="full") + + for event in op_events: + instr_id = event._instruction_id + if not event.debug_data: + logging.debug( + f"Missing intermediate tensor data for {event.name} ({instr_id=})" + ) + continue + + with open(f"{self.tensor_dump_dir}/{instr_id}.txt", "w") as f: + for dd in event.debug_data: + f.write(f"{str(dd)}\n\n") + torch.set_printoptions(profile="default") + + +def get_op_names(program: et_schema.Program, execution_plan_id: int = 0) -> set[str]: + """ + Get the list of operators from a Program + """ + + op_names = { + f"{op.name}.{op.overload}" + for op in program.execution_plan[execution_plan_id].operators + } + for delegate in program.execution_plan[execution_plan_id].delegates: + logging.debug(f"Delegate: {delegate.id}") + if delegate.id == "CadenceExecutorchBackend": + assert delegate.processed.location == DataLocation.INLINE + op_names |= get_op_names( + deserialize_pte_binary( + program.backend_delegate_data[delegate.processed.index].data + ) + ) + return op_names + + +# Run an ExecutorchProgram using the specified inputs and backend +def run( + executorch_prog: ExecutorchProgramManager, + inputs: Any, + ref_outputs: Optional[Sequence[torch.Tensor]] = None, + working_dir: Optional[str] = None, +) -> Any: + # Get the Program + program = executorch_prog.executorch_program + out_spec = executorch_prog.exported_program().call_spec.out_spec + # Run the program and return the outputs + assert isinstance( + program, et_schema.Program + ), f"program must be Program. Got {type(program)} instead." + + if working_dir is None: + working_dir = tempfile.mkdtemp(dir="/tmp") + + # initialize Jarvis e2e Executor with executorch_cfg. + executor = Executor(working_dir) + + # run Executor + executor() + + etdump = JarvisETDump(output_dir=working_dir) + outputs = etdump.get_outputs() + + assert isinstance(out_spec, TreeSpec) + outputs = torch.utils._pytree.tree_unflatten(outputs, out_spec) + + return outputs + + +def compare( + # pyre-fixme[2]: Parameter annotation cannot be `Any`. + outputs: Any, + # pyre-fixme[2]: Parameter annotation cannot be `Any`. 
+ ref_outputs: Any, + name: str = "", + eps_error: float = 1e-1, + eps_warn: float = 1e-5, +) -> None: + if isinstance(ref_outputs, dict): + for k, v in outputs.items(): + compare(v, ref_outputs[k], f"{name}/{k}", eps_error, eps_warn) + return + + if isinstance(ref_outputs, (list, tuple)): + for i, (output, ref_output) in enumerate(zip(outputs, ref_outputs)): + compare(output, ref_output, f"{name}/{i}", eps_error, eps_warn) + return + + assert isinstance(ref_outputs, torch.Tensor), f"Got {type(ref_outputs)} instead." + + ref_outputs = to_nd_array(ref_outputs) + outputs = to_nd_array(outputs) + + # compare + rms = utils.rms(outputs, ref_outputs) + norm_rms = utils.normalized_rms(outputs, ref_outputs) + max_abs_diff = utils.max_abs_diff(outputs, ref_outputs) + max_rel_diff = utils.max_rel_diff(outputs, ref_outputs) + stats = ( + f"{rms = }, {norm_rms = }, {max_abs_diff = }, {max_rel_diff = :.2f}%, " + f"{outputs.shape = }[{outputs.dtype}], {ref_outputs.shape = }[{ref_outputs.dtype}]" + ) + + if np.isnan(rms) or rms > eps_error: + logging.error(f"\33[31m[Error]\33[0m Output {name} mismatched! {stats}") + logging.error(f"Expected: {ref_outputs}\n") + logging.error(f"Got instead: {outputs}\n") + raise RuntimeError(f"\33[31m[Error]\33[0m Output {name} mismatched! {stats}") + elif rms > eps_warn: + logging.warning(f"\33[33m[Warning]\33[0m Output {name} mismatched!. {stats}") + else: + logging.info(f"\33[32m[Passed]\33[0m Output {name} matched. {stats}") + + +def run_and_compare( + executorch_prog: ExecutorchProgramManager, + inputs: Any, + ref_outputs: Optional[Sequence[torch.Tensor]] = None, + working_dir: Optional[str] = None, + eps_error: float = 1e-1, + eps_warn: float = 1e-5, +) -> Any: + outputs = run(executorch_prog, inputs, ref_outputs, working_dir) + compare(outputs, ref_outputs, eps_error=eps_error, eps_warn=eps_warn) + + +# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. +def to_nd_array(v: Union[bool, numbers.Number, ndarray, torch.Tensor]) -> np.ndarray: + if isinstance(v, np.ndarray): + return v + + if isinstance(v, torch.Tensor): + # If v was quantized, we compare its int representation. + v = v.int_repr() if v.is_quantized else v + return v.cpu().detach().numpy() + + if isinstance(v, (numbers.Number, bool)): + return np.array([v]) + + raise RuntimeError(f"Unknown type {type(v)}") diff --git a/backends/cadence/runtime/utils.py b/backends/cadence/runtime/utils.py new file mode 100644 index 0000000000..b3ed622e8b --- /dev/null +++ b/backends/cadence/runtime/utils.py @@ -0,0 +1,108 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import logging +import typing +from typing import Callable, Union + +import numpy as np +import torch + + +# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. +def distance(fn: Callable[[np.ndarray, np.ndarray], float]) -> Callable[ + [ + # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. + typing.Union[np.ndarray, torch._tensor.Tensor], + # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. + typing.Union[np.ndarray, torch._tensor.Tensor], + ], + float, +]: + # A distance decorator that performs all the necessary checkes before calculating + # the distance between two N-D tensors given a function. This can be a RMS + # function, maximum abs diff, or any kind of distance function. 
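Editorial note: a brief sketch of how the `@distance`-decorated metrics in this utils module behave, assuming the module is importable as `executorch.backends.cadence.runtime.utils` (the import used by `runtime.py` above). Shape mismatches and mismatched inf/nan masks yield NaN; otherwise the metric runs on the finite entries only.

```python
# Sketch: using the distance-based metrics defined in this file.
import torch

from executorch.backends.cadence.runtime import utils

a = torch.randn(4, 8)
b = a + 0.01 * torch.randn(4, 8)

print(utils.rms(a, b))                   # small RMS error
print(utils.max_abs_diff(a, b))          # largest absolute deviation
print(utils.rms(a, torch.randn(3, 8)))   # shape mismatch -> nan
print(utils.normalized_rms(a, b))        # RMS normalized by the reference norm
```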
+ def wrapper( + # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. + a: Union[np.ndarray, torch.Tensor], + # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. + b: Union[np.ndarray, torch.Tensor], + ) -> float: + # convert a and b to np.ndarray type fp64 + a = to_np_arr_fp64(a) + b = to_np_arr_fp64(b) + + # return NaN if shape mismatches + if a.shape != b.shape: + return np.nan + + # After we make sure shape matches, check if it's empty. If yes, return 0 + if a.size == 0: + return 0 + + # np.isinf and np.isnan returns a Boolean mask. Check if Inf or NaN occur at + # the same places in a and b. If not, return NaN + if np.any(np.isinf(a) != np.isinf(b)) or np.any(np.isnan(a) != np.isnan(b)): + return np.nan + + # mask out all the values that are either Inf or NaN + mask = np.isinf(a) | np.isnan(a) + if np.any(mask): + logging.warning("Found inf/nan in tensor when calculating the distance") + + a_masked = a[~mask] + b_masked = b[~mask] + + # after masking, the resulting tensor might be empty. If yes, return 0 + if a_masked.size == 0: + return 0 + + # only compare the rest (those that are actually numbers) using the metric + return fn(a_masked, b_masked) + + return wrapper + + +@distance +# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. +def rms(a: np.ndarray, b: np.ndarray) -> float: + return ((a - b) ** 2).mean() ** 0.5 + + +@distance +# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. +def max_abs_diff(a: np.ndarray, b: np.ndarray) -> float: + return np.abs(a - b).max() + + +@distance +# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. +def max_rel_diff(x: np.ndarray, x_ref: np.ndarray) -> float: + return np.abs((x - x_ref) / x_ref).max() + + +# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. +def to_np_arr_fp64(x: Union[np.ndarray, torch.Tensor]) -> np.ndarray: + if isinstance(x, torch.Tensor): + x = x.detach().cpu().numpy() + if isinstance(x, np.ndarray): + x = x.astype(np.float64) + return x + + +# pyre-fixme[3]: Return type must be annotated. +def normalized_rms( + # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. + predicted: Union[np.ndarray, torch.Tensor], + # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. + ground_truth: Union[np.ndarray, torch.Tensor], +): + num = rms(predicted, ground_truth) + if num == 0: + return 0 + den = np.linalg.norm(to_np_arr_fp64(ground_truth)) + return np.float64(num) / np.float64(den) diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index cefc330d3d..babdb96d8b 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -58,6 +58,7 @@ add_compile_options("-Wall" "-Werror" "-Wno-sign-compare") # which can be ignored by GNU. So we make it a warning, not an error in GNU. if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") add_compile_options("-Wno-error=attributes") + add_link_options("-flto=auto") endif() if(CMAKE_BUILD_TYPE STREQUAL "Release") @@ -67,7 +68,6 @@ if(CMAKE_BUILD_TYPE STREQUAL "Release") # --gc-sections is added by torch. 
add_compile_options( "-O3" "-ffunction-sections" "-fdata-sections" "-frtti" - "-Wno-unused-command-line-argument" ) endif() @@ -259,6 +259,22 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") pybind11_strip(PyQnnWrapperAdaptor) endif() + if(CMAKE_BUILD_TYPE STREQUAL "Release") + # need to allow exceptions in pybind + set(_pybind_compile_options + -Wno-deprecated-declarations + -fPIC + -frtti + -fexceptions + ) + target_compile_options( + PyQnnManagerAdaptor PUBLIC ${_pybind_compile_options} + ) + target_compile_options( + PyQnnWrapperAdaptor PUBLIC ${_pybind_compile_options} + ) + endif() + add_subdirectory( ${QNN_EXECUTORCH_ROOT_DIR}/aot/python ${CMAKE_CURRENT_BINARY_DIR}/qnn_executorch/python diff --git a/backends/qualcomm/README.md b/backends/qualcomm/README.md index 618a1f3e32..3c0fdd8f98 100644 --- a/backends/qualcomm/README.md +++ b/backends/qualcomm/README.md @@ -1,12 +1,14 @@ # Qualcomm AI Engine Direct Backend Disclaimer: At present, we do not offer any backward compatibility guarantees -for any APIs. We are currently in a pre-alpha development phase, and as such, +for any APIs. We are currently in a development phase, and as such, we reserve the right to modify interfaces and implementations. This backend is implemented on the top of [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk). -Please follow [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to setup environment, build, and run executorch models by this backend (Qualcomm AI Engine Direct is also referred to as QNN in the source and documentation). +Please follow [tutorial](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md) to setup environment, build, and run executorch models by this backend (Qualcomm AI Engine Direct is also referred to as QNN in the source and documentation). + +A website version of the tutorial is [here](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html). ## Delegate Options @@ -29,7 +31,7 @@ Add SoC model into QcomChipset enum in [schema](./serialization/schema.fbs) and Insert new SoC information into _soc_info_table in [qnn_compile_spec_schema](./serialization/qnn_compile_spec_schema.py). #### Step 3: Recompile the .pte file -Follow [setup](setup.md) to setup environment and build runtime with new schema header. +Follow [setup](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md) to setup environment and build runtime with new schema header. ### Supported Inference Type - Quantized @@ -46,6 +48,7 @@ backends/qualcomm ├── partition # QNN Partitioner (AoT Part). ├── passes # Various passes helping lower models to QNN backend (AoT Part). ├── python # Places to put pybind artifacts for accessing QNN APIs, structures, etc (AoT Part). +├── quantizer # QNN Quantizer ├── runtime # Here is QNN runtime responsbile for compiling a model on x64. | | # Meanwhile, this is also the runtime responsbile for executing compiled | | # models on a device. @@ -58,8 +61,11 @@ backends/qualcomm ├── tests # Unit tests and model tests go here. └── utils # Miscellaneous utilities. -examples -└── qualcomm # Examples to run QNN backends. +examples/qualcomm +├── executor_runner # A general runner that is capable of running most of the basic models. +├── oss_scripts # Scripts for OSS(Open Source Software) models and customized runner for some specific models. +├── qaihub_scripts # Scripts for Qaihub models and corresponding customized runner for these models. 
+└── scripts # Scripts for models provided by executorch. ``` ## Examples diff --git a/backends/qualcomm/aot/ir/qcir_utils.h b/backends/qualcomm/aot/ir/qcir_utils.h index a6b5e50c23..4092908ced 100755 --- a/backends/qualcomm/aot/ir/qcir_utils.h +++ b/backends/qualcomm/aot/ir/qcir_utils.h @@ -8,8 +8,8 @@ #pragma once -#include "QnnTypes.h" #include +#include "QnnTypes.h" namespace torch { namespace executor { diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py index 641e2445f2..e07a745df5 100644 --- a/backends/qualcomm/builders/node_visitor.py +++ b/backends/qualcomm/builders/node_visitor.py @@ -12,13 +12,20 @@ import numpy as np import torch from executorch.backends.qualcomm.utils.constants import ( + QCOM_AXIS, QCOM_AXIS_ORDER, QCOM_BITWIDTH, + QCOM_DTYPE, QCOM_ENCODING, + QCOM_OFFSET, QCOM_QUANT_ATTRS, + QCOM_QUANT_MAX, + QCOM_QUANT_MIN, QCOM_REQUANTIZE, + QCOM_SCALE, QCOM_SCALE_OFFSET, QCOM_SCALES, + QCOM_ZERO_POINT, QCOM_ZERO_POINTS, ) @@ -125,16 +132,16 @@ def make_qnn_per_channel_config(self, node: torch.fx.Node, quant_attrs: Dict): "convolution" in user_0.target.__name__ and list(node.users)[0].args[1] == node ): - quant_config["axis"] = 3 + quant_config[QCOM_AXIS] = 3 else: - quant_config["axis"] = quant_attrs["axis"] + quant_config[QCOM_AXIS] = quant_attrs[QCOM_AXIS] quant_config[QCOM_SCALE_OFFSET] = scale_offset # special case for 4 bits if ( - quant_config["dtype"] == torch.int8 - and quant_config["quant_max"] - quant_config["quant_min"] <= 15 + quant_config[QCOM_DTYPE] == torch.int8 + and quant_config[QCOM_QUANT_MAX] - quant_config[QCOM_QUANT_MIN] <= 15 ): quant_config[QCOM_BITWIDTH] = 4 return ( @@ -149,11 +156,11 @@ def make_qnn_per_channel_config(self, node: torch.fx.Node, quant_attrs: Dict): def make_qnn_per_tensor_config(self, quant_attrs: Dict): quant_config = copy.deepcopy(quant_attrs) # check Qnn_ScaleOffset_t in QNN/include/QnnTypes.h - quant_config["offset"] = -quant_attrs["zero_point"] + quant_config[QCOM_OFFSET] = -quant_attrs[QCOM_ZERO_POINT] # special case for 4 bits if ( - quant_config["dtype"] == torch.int8 - and quant_config["quant_max"] - quant_config["quant_min"] <= 15 + quant_config[QCOM_DTYPE] == torch.int8 + and quant_config[QCOM_QUANT_MAX] - quant_config[QCOM_QUANT_MIN] <= 15 ): quant_config[QCOM_BITWIDTH] = 4 return ( @@ -187,15 +194,15 @@ def get_quant_tensor_value( self, tensor: torch.Tensor, quant_attrs: Dict, quant_configs: Dict ) -> torch.Tensor: if quant_attrs[QCOM_ENCODING] in PER_TENSOR_ENCODING: - scale = quant_attrs["scale"] - zero_point = quant_attrs["zero_point"] + scale = quant_attrs[QCOM_SCALE] + zero_point = quant_attrs[QCOM_ZERO_POINT] else: # per channel case scale = quant_attrs[QCOM_SCALES] zero_point = quant_attrs[QCOM_ZERO_POINTS] - dtype = quant_configs["dtype"] + dtype = quant_configs[QCOM_DTYPE] - tensor = tensor.div(scale).add(zero_point).round().to(dtype) + tensor = tensor.div(scale + 1e-6).add(zero_point).round().to(dtype) # Make the backends access data correctly if quant_configs.get(QCOM_BITWIDTH) == 4: mask = torch.full(tensor.size(), 0x0F, dtype=torch.int8) @@ -233,8 +240,8 @@ def get_data_type( quant_config: Dict, ) -> PyQnnWrapper.Qnn_TensorType_t: if quant_config: - quant_config["dtype"] = deduce_dtype(tensor, quant_config) - return QNN_QUANT_TYPE_MAP[quant_config["dtype"]] + quant_config[QCOM_DTYPE] = deduce_dtype(tensor, quant_config) + return QNN_QUANT_TYPE_MAP[quant_config[QCOM_DTYPE]] return QNN_TENSOR_TYPE_MAP[tensor.dtype] diff --git 
a/backends/qualcomm/builders/op_conv2d.py b/backends/qualcomm/builders/op_conv2d.py index 4b58edbac6..909cc6a21f 100644 --- a/backends/qualcomm/builders/op_conv2d.py +++ b/backends/qualcomm/builders/op_conv2d.py @@ -10,7 +10,16 @@ import numpy as np import torch -from executorch.backends.qualcomm.utils.constants import QCOM_DATA +from executorch.backends.qualcomm.utils.constants import ( + QCOM_DATA, + QCOM_DTYPE, + QCOM_QUANT_ATTRS, + QCOM_QUANT_MAX, + QCOM_QUANT_MIN, + QCOM_SCALE, + QCOM_ZERO_POINT, +) +from executorch.exir.dialects._ops import ops as exir_ops from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import ( @@ -85,6 +94,52 @@ def _add_conv_op_parameter( return conv_op + def _get_bias_tensor( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[str, PyQnnWrapper.TensorWrapper], + num_output_channel: int, + ) -> PyQnnWrapper.PyQnnOpWrapper: + # build dummy node if bias is not given + bias_node = ( + node.args[2] + if node.args[2] is not None + else torch.fx.Node( + node.graph, + node.name + "_runtime_bias", + "call_function", + exir_ops.edge.aten.full.default, + (), # args + {}, # kwargs + ) + ) + # zeros tensor to meet HTP constraint if bias is not given + bias_tensor = ( + get_parameter(bias_node, self.edge_program) + if node.args[2] is not None + else torch.zeros(num_output_channel) + ) + # insert quant attribute to meet HTP constraint if bias is not given + if ( + node.args[2] is None + and (bias_quant_attrs := node.meta.get(QCOM_QUANT_ATTRS)) is not None + ): + quant_attrs = bias_quant_attrs.copy() + quant_attrs[QCOM_ZERO_POINT] = 0 + quant_attrs[QCOM_SCALE] = 0 + quant_attrs[QCOM_DTYPE] = torch.int32 + quant_attrs[QCOM_QUANT_MAX] = torch.iinfo(torch.int32).max + quant_attrs[QCOM_QUANT_MIN] = torch.iinfo(torch.int32).min + 1 + bias_node.meta[QCOM_QUANT_ATTRS] = quant_attrs + + return self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + def _define_conv1d( self, node: torch.fx.Node, @@ -149,17 +204,9 @@ def _define_conv1d( is_input_tensor=False, ) conv_input_tensors = [unsqueeze_output_tensor_wrapper, filter_tensor_wrapper] - if node.args[2] is not None: - bias_node = node.args[2] - bias_tensor = get_parameter(bias_node, self.edge_program) - bias_tensor_wrapper = self.define_tensor( - bias_node, - bias_tensor, - PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, - nodes_to_wrappers, - is_input_tensor=False, - ) - conv_input_tensors.append(bias_tensor_wrapper) + conv_input_tensors.append( + self._get_bias_tensor(node, nodes_to_wrappers, filter_tensor.shape[-1]) + ) stride = [1] + cast(List[int], node.args[3]) padding = [0] + cast(List[int], node.args[4]) @@ -265,18 +312,9 @@ def define_node( is_input_tensor=False, ) conv_input_tensors = [input_tensor_wrapper, filter_tensor_wrapper] - - if node.args[2] is not None: - bias_node = node.args[2] - bias_tensor = get_parameter(bias_node, self.edge_program) - bias_tensor_wrapper = self.define_tensor( - bias_node, - bias_tensor, - PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, - nodes_to_wrappers, - is_input_tensor=False, - ) - conv_input_tensors.append(bias_tensor_wrapper) + conv_input_tensors.append( + self._get_bias_tensor(node, nodes_to_wrappers, filter_tensor.shape[-1]) + ) output_tensor = self.get_tensor(node, node) output_tensor_wrapper = self.define_tensor( diff --git a/backends/qualcomm/builders/op_prelu.py b/backends/qualcomm/builders/op_prelu.py index fc0c6b9232..5da017b8b7 
100644 --- a/backends/qualcomm/builders/op_prelu.py +++ b/backends/qualcomm/builders/op_prelu.py @@ -11,6 +11,10 @@ from executorch.backends.qualcomm.utils.constants import ( QCOM_AXIS_ORDER, QCOM_QUANT_ATTRS, + QCOM_QUANT_MAX, + QCOM_QUANT_MIN, + QCOM_SCALE, + QCOM_ZERO_POINT, ) from executorch.exir.dialects._ops import ops as exir_ops @@ -77,10 +81,10 @@ def define_node( ) if pow_quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): quant_attrs = pow_quant_attrs.copy() - quant_range = quant_attrs["quant_max"] - quant_attrs["quant_min"] + quant_range = quant_attrs[QCOM_QUANT_MAX] - quant_attrs[QCOM_QUANT_MIN] # coeff is guaranteed to be positive - quant_attrs["zero_point"] = 0 - quant_attrs["scale"] = coeff / quant_range + quant_attrs[QCOM_ZERO_POINT] = 0 + quant_attrs[QCOM_SCALE] = coeff / quant_range scalar_node.meta[QCOM_QUANT_ATTRS] = quant_attrs scalar_tensor_wrapper = self.define_tensor( diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py index c60afc2dd3..353169bc18 100644 --- a/backends/qualcomm/partition/common_defs.py +++ b/backends/qualcomm/partition/common_defs.py @@ -16,6 +16,10 @@ exir_ops.edge.aten.copy.default, ] +to_be_implemented_operator = [ + exir_ops.edge.aten.where.default, +] + allow_list_operator = [ _operator.getitem, ] diff --git a/backends/qualcomm/partition/qnn_partitioner.py b/backends/qualcomm/partition/qnn_partitioner.py index c3afc23dae..86028d0d44 100644 --- a/backends/qualcomm/partition/qnn_partitioner.py +++ b/backends/qualcomm/partition/qnn_partitioner.py @@ -27,7 +27,11 @@ from torch.fx.passes.infra.partitioner import Partition from torch.fx.passes.operator_support import OperatorSupportBase -from .common_defs import allow_list_operator, not_supported_operator +from .common_defs import ( + allow_list_operator, + not_supported_operator, + to_be_implemented_operator, +) class QnnOperatorSupport(OperatorSupportBase): @@ -62,6 +66,12 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool: if node.op != "call_function" or node.target in not_supported_operator: return False + if node.target in to_be_implemented_operator: + print( + f"[QNN Partitioner Op Support]: {node.target.__name__} | Skipped, this op can be supported, please report an issue in https://github.com/pytorch/executorch/issues" + ) + return False + if node.target in allow_list_operator: return True diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index bed9a2b4ff..be317a2d64 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -16,7 +16,7 @@ usage() { echo "Usage: Build the aarch64 version of executor runner or the python interface of Qnn Manager" echo "First, you need to set the environment variable for QNN_SDK_ROOT" echo ", and if you want to build the aarch64 version of executor runner" - echo ", you need to set ANDROID_NDK_ROOT" + echo ", you need to export ANDROID_NDK_ROOT=/path/to/android_ndkXX" echo "e.g.: executorch$ ./backends/qualcomm/scripts/build.sh --skip_x86_64" exit 1 } @@ -25,9 +25,9 @@ usage() { [ "$1" = -h ] && usage BUILD_X86_64="true" -CMAKE_X86_64="build_x86_64" +CMAKE_X86_64="cmake-out" BUILD_AARCH64="true" -CMAKE_AARCH64="build_android" +CMAKE_AARCH64="cmake-out-android" CLEAN="true" BUILD_TYPE="Debug" BUILD_JOB_NUMBER="16" @@ -61,12 +61,16 @@ PRJ_ROOT="$( cd "$(dirname "$0")/../../.." 
; pwd -P)" if [ "$BUILD_AARCH64" = true ]; then if [[ -z ${ANDROID_NDK_ROOT} ]]; then - echo "Please export ANDROID_NDK_ROOT=/path/to/android_ndk" + echo "Please export ANDROID_NDK_ROOT=/path/to/android_ndkXX" exit -1 fi + BUILD_ROOT=$PRJ_ROOT/$CMAKE_AARCH64 if [ "$CLEAN" = true ]; then rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT + else + # Force rebuild flatccrt for the correct platform + cd $BUILD_ROOT/sdk && make clean fi cd $BUILD_ROOT @@ -103,15 +107,17 @@ if [ "$BUILD_AARCH64" = true ]; then fi if [ "$BUILD_X86_64" = true ]; then - # Build python interface BUILD_ROOT=$PRJ_ROOT/$CMAKE_X86_64 if [ "$CLEAN" = true ]; then rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT + else + # Force rebuild flatccrt for the correct platform + cd $BUILD_ROOT/sdk && make clean fi + cd $BUILD_ROOT - # TODO: Use CMAKE_BUILD_TYPE=RelWithDebInfo, and handle flatcc issues cmake \ - -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \ -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ -DEXECUTORCH_BUILD_QNN=ON \ @@ -131,7 +137,7 @@ if [ "$BUILD_X86_64" = true ]; then CMAKE_PREFIX_PATH="${BUILD_ROOT}/lib/cmake/ExecuTorch;${BUILD_ROOT}/third-party/gflags;" cmake $PRJ_ROOT/$EXAMPLE_ROOT \ - -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \ -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ diff --git a/backends/qualcomm/setup.md b/backends/qualcomm/setup.md index b4b0f2ea72..37d8e04c21 100644 --- a/backends/qualcomm/setup.md +++ b/backends/qualcomm/setup.md @@ -1,189 +1,7 @@ # Setting up QNN Backend -This is a tutorial for building and running Qualcomm AI Engine Direct backend, +Please refer to [Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md). + +That is a tutorial for building and running Qualcomm AI Engine Direct backend, including compiling a model on a x64 host and running the inference on a Android device. - - -## Prerequisite - -Please finish tutorial [Setting up executorch](../../docs/source/getting-started-setup.md). - - -## Conventions - -`$QNN_SDK_ROOT` refers to the root of Qualcomm AI Engine Direct SDK, -i.e., the directory containing `QNN_README.txt`. - -`$ANDROID_NDK_ROOT` refers to the root of Android NDK. - -`$EXECUTORCH_ROOT` refers to the root of executorch git repository. - - -## Environment Setup - -### Download Qualcomm AI Engine Direct SDK - -Navigate to [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) and follow the download button. - -You might need to apply for a Qualcomm account to download the SDK. - -After logging in, search Qualcomm AI Stack at the *Tool* panel. -You can find Qualcomm AI Engine Direct SDK under the AI Stack group. - -Please download the Linux version, and follow instructions on the page to -extract the file. - -The SDK should be installed to somewhere `/opt/qcom/aistack/qnn` by default. - -### Download Android NDK - -Please navigate to [Android NDK](https://developer.android.com/ndk) and download -a version of NDK. We recommend LTS version, currently r25c. - -### Setup environment variables - -We need to make sure Qualcomm AI Engine Direct libraries can be found by -the dynamic linker on x64. Hence we set `LD_LIBRARY_PATH`. In production, -we recommend users to put libraries in default search path or use `rpath` -to indicate the location of libraries. 
- -Further, we set up `$PYTHONPATH` because it's easier to develop and import executorch Python APIs. Users might also build and install executorch package as usual python package. - -```bash -export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/:$LD_LIBRARY_PATH -export PYTHONPATH=$EXECUTORCH_ROOT/.. -``` - -Note: Since we set `PYTHONPATH`, we may have issue with finding `program.fbs` -and `scalar_type.fbs` when we export a model, because they are installed into -`pip-out` directory with the same package name pattern. A workaround is that -we copy `$EXECUTORCH_ROOT/pip-out/lib.linux-x86_64-cpython-310/executorch/exir/_serialize/program.fbs` -and `$EXECUTORCH_ROOT/pip-out/lib.linux-x86_64-cpython-310/executorch/exir/_serialize/scalar_type.fbs` -to `$EXECUTORCH_ROOT/exir/_serialize/`. - - -## End to End Inference - -### Step 1: Build Python APIs for AOT compilation on x64 - -Python APIs on x64 are required to compile models to Qualcomm AI Engine Direct binary. -Make sure `buck2` is under a directory in `PATH`. - -```bash -cd $EXECUTORCH_ROOT -mkdir build_x86_64 -cd build_x86_64 -cmake .. -DEXECUTORCH_BUILD_QNN=ON -DQNN_SDK_ROOT=${QNN_SDK_ROOT} -cmake --build . -t "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j8 - -# install Python APIs to correct import path -# The filename might vary depending on your Python and host version. -cp -f backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so $EXECUTORCH_ROOT/backends/qualcomm/python -cp -f backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so $EXECUTORCH_ROOT/backends/qualcomm/python -``` - - -### Step 2: Build `qnn_executor_runner` for Android - -`qnn_executor_runner` is an executable running the compiled model. - -You might want to ensure the correct `flatc`. `flatc` can be built along with the above step. For example, we can find `flatc` in `build_x86_64/third-party/flatbuffers/`. - -We can prepend `$EXECUTORCH_ROOT/build_x86_64/third-party/flatbuffers` to `PATH`. Then below cross-compiling can find the correct flatbuffer compiler. - -Commands to build `qnn_executor_runner` for Android: - -```bash -cd $EXECUTORCH_ROOT -mkdir build_android -cd build_android -# build executorch & qnn_executorch_backend -cmake .. \ - -DCMAKE_INSTALL_PREFIX=$PWD \ - -DEXECUTORCH_BUILD_QNN=ON \ - -DEXECUTORCH_BUILD_SDK=ON \ - -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ - -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI='arm64-v8a' \ - -DANDROID_NATIVE_API_LEVEL=23 \ - -B$PWD - -cmake --build $PWD -j16 --target install - -cmake ../examples/qualcomm \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI='arm64-v8a' \ - -DANDROID_NATIVE_API_LEVEL=23 \ - -DCMAKE_PREFIX_PATH="$PWD/lib/cmake/ExecuTorch;$PWD/third-party/gflags;" \ - -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ - -Bexamples/qualcomm - -cmake --build examples/qualcomm -j16 -``` -**Note:** If you want to build for release, add `-DCMAKE_BUILD_TYPE=Release` to the `cmake` command options. - -You can find `qnn_executor_runner` under `build_android/examples/qualcomm/`. - - -### Step 3: Compile a model - -``` -python -m examples.qualcomm.scripts.export_example --model_name mv2 -``` - -Then the generated `mv2.pte` can be run on the device by -`build_android/backends/qualcomm/qnn_executor_runner` with Qualcomm AI Engine -Direct backend. 
- -[**Note**] To get proper accuracy, please apply calibrations with representative -dataset, which could be learnt more from examples under `examples/qualcomm/`. - - -### Step 4: Model Inference - -The backend rely on Qualcomm AI Engine Direct SDK libraries. - -You might want to follow docs in Qualcomm AI Engine Direct SDK to setup the device environment. -Or see below for a quick setup for testing: - -```bash -# make sure you have write-permission on below path. -DEVICE_DIR=/data/local/tmp/executorch_test/ -adb shell "mkdir -p ${DEVICE_DIR}" -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV73Stub.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV75Stub.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR} -``` - -We also need to indicate dynamic linkers on Android and Hexagon where to find these libraries -by setting `ADSP_LIBRARY_PATH` and `LD_LIBRARY_PATH`. - -So, we can run `qnn_executor_runner` like -```bash -adb push mv2.pte ${DEVICE_DIR} -adb push ${EXECUTORCH_ROOT}/build_android/examples/qualcomm/qnn_executor_runner ${DEVICE_DIR} -adb shell "cd ${DEVICE_DIR} \ - && export LD_LIBRARY_PATH=${DEVICE_DIR} \ - && export ADSP_LIBRARY_PATH=${DEVICE_DIR} \ - && ./qnn_executor_runner --model_path ./mv2_qnn.pte" -``` - -You should see the following result. -Note that no output file will be generated in this example. -``` -I 00:00:00.133366 executorch:qnn_executor_runner.cpp:156] Method loaded. -I 00:00:00.133590 executorch:util.h:104] input already initialized, refilling. -I 00:00:00.135162 executorch:qnn_executor_runner.cpp:161] Inputs prepared. -I 00:00:00.136768 executorch:qnn_executor_runner.cpp:278] Model executed successfully. 
-[INFO][Qnn ExecuTorch] Destroy Qnn backend parameters -[INFO][Qnn ExecuTorch] Destroy Qnn context -[INFO][Qnn ExecuTorch] Destroy Qnn device -[INFO][Qnn ExecuTorch] Destroy Qnn backend -``` diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index ff52fc61b5..319cc6092c 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -203,14 +203,14 @@ def example_inputs(self): class Conv1dSequential(torch.nn.Module): - def __init__(self): + def __init__(self, bias=True): super().__init__() self.first = torch.nn.Conv1d( in_channels=1, out_channels=3, kernel_size=(3), padding=1, - bias=True, + bias=bias, ) self.second = torch.nn.Conv1d( @@ -218,7 +218,7 @@ def __init__(self): out_channels=2, kernel_size=(3), padding=1, - bias=True, + bias=bias, ) def forward(self, x): @@ -315,21 +315,21 @@ def forward(self, x): class Conv2dSequential(torch.nn.Module): - def __init__(self): + def __init__(self, bias=True): super().__init__() self.first = torch.nn.Conv2d( in_channels=1, out_channels=3, kernel_size=(3, 3), padding=1, - bias=True, + bias=bias, ) self.second = torch.nn.Conv2d( in_channels=3, out_channels=2, kernel_size=(3, 3), padding=1, - bias=True, + bias=bias, ) def forward(self, x): @@ -337,14 +337,14 @@ def forward(self, x): class Conv2dSingle(torch.nn.Module): - def __init__(self): + def __init__(self, bias=True): super().__init__() self.conv = torch.nn.Conv2d( in_channels=1, out_channels=3, kernel_size=(3, 3), padding=1, - bias=True, + bias=bias, ) def forward(self, x): diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 59404c99cf..1f779504bd 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -34,7 +34,7 @@ generate_qnn_executorch_compiler_spec, ) -from executorch.examples.qualcomm.scripts.utils import setup_common_args_and_variables +from executorch.examples.qualcomm.utils import setup_common_args_and_variables from executorch.backends.qualcomm.tests.models import * # noqa: F403 @@ -109,14 +109,18 @@ def test_qnn_backend_clamp(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_conv1d(self): - module = Conv1dSequential() # noqa: F405 + modules = [Conv1dSequential(), Conv1dSequential(bias=False)] # noqa: F405 sample_input = (torch.randn([1, 1, 3]),) - self.lower_module_and_test_output(module, sample_input) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_conv2d(self): - module = Conv2dSequential() # noqa: F405 + modules = [Conv2dSequential(), Conv2dSequential(bias=False)] # noqa: F405 sample_input = (torch.randn([1, 1, 3, 3]),) - self.lower_module_and_test_output(module, sample_input) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_element_wise_add(self): test_comb = [ @@ -597,12 +601,14 @@ def setUp(self): ) def test_qnn_backend_16a4w_conv2d(self): - module = Conv2dSingle() # noqa: F405 + modules = [Conv2dSingle(), Conv2dSingle(bias=False)] # noqa: F405 sample_input = (torch.randn([1, 1, 3, 3]),) - module = self.get_qdq_module( - module, sample_input, quant_dtype=QuantDtype.use_16a4w - ) - self.lower_module_and_test_output(module, sample_input) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module( + module, sample_input, 
quant_dtype=QuantDtype.use_16a4w + ) + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_16a4w_linear(self): module = Linear() # noqa: F405 @@ -683,16 +689,20 @@ def test_qnn_backend_clamp(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_conv1d(self): - module = Conv1dSequential() # noqa: F405 + modules = [Conv1dSequential(), Conv1dSequential(bias=False)] # noqa: F405 sample_input = (torch.randn([1, 1, 3]),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_conv2d(self): - module = Conv2dSequential() # noqa: F405 + modules = [Conv2dSequential(), Conv2dSequential(bias=False)] # noqa: F405 sample_input = (torch.randn([1, 1, 3, 3]),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_element_wise_add(self): test_comb = [ @@ -1803,6 +1813,60 @@ def test_squeezenet(self): self.assertGreaterEqual(msg["top_5"], 70) +class TestExampleQaihubScript(TestQNN): + + def required_envs(self, conditions=None) -> bool: + conditions = [] if conditions is None else conditions + return all( + [ + self.executorch_root, + self.artifact_dir, + *conditions, + ] + ) + + def test_llama2_7b(self): + if not self.required_envs(): + self.skipTest("missing required envs") + + prompt = "Explain the rules of baseball" + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/qaihub_scripts/llama2/qaihub_llama2_7b.py", + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--tokenizer_bin", + f"{self.artifact_dir}/tokenizer.bin", + "--context_binaries", + f"{self.artifact_dir}", + "--ip", + self.ip, + "--port", + str(self.port), + "--prompt", + f"{prompt}", + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + model_out = msg["result"] + self.assertTrue(model_out.startswith(prompt)) + + class TestExampleScript(TestQNN): def required_envs(self, conditions=None) -> bool: conditions = [] if conditions is None else conditions @@ -2085,7 +2149,7 @@ def test_stories_single_llama(self): cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/llama2/llama.py", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama2/llama.py", "--artifact", self.artifact_dir, "--build_folder", diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index ef0ac0f202..5fd6d5ad19 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -27,7 +27,7 @@ QcomChipset, ) from executorch.backends.qualcomm.utils.utils import capture_program -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( generate_inputs, make_output_dir, SimpleADB, @@ -231,25 +231,43 @@ def validate_profile(): qnn_sdk = os.environ.get("QNN_SDK_ROOT", None) assert 
qnn_sdk, "QNN_SDK_ROOT was not found in environment variable" - build_path = "build_x86_64" - cmds = [ - # export LD_LIBRARY_PATH to QNN_SDK_ROOT - f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{self.executorch_root}/{build_path}/lib && " + build_folder = self.build_folder + if os.path.isabs(self.build_folder): + # obey user's opinion + pass + else: + # ok, assuming the user give a relative path to cwd + build_folder = os.path.join(os.getcwd(), self.build_folder) + + cmd = [ # qnn_executor_runner - f"{self.executorch_root}/{build_path}/examples/qualcomm/qnn_executor_runner", - f"--model_path {pte_fname}", - f"--input_list_path {tmp_dir}/input_list.txt", - f"--output_folder_path {output_dir}", + f"{build_folder}/examples/qualcomm/executor_runner/qnn_executor_runner", + "--model_path", + f"{pte_fname}", + "--input_list_path", + f"{tmp_dir}/input_list.txt", + "--output_folder_path", + f"{output_dir}", ] - subprocess.run( - " ".join(cmds), - shell=True, - executable="/bin/bash", - capture_output=True, + env = dict(os.environ) + env["LD_LIBRARY_PATH"] = f"{qnn_sdk}/lib/{target}/:{build_folder}/lib" + proc = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env=env, cwd=tmp_dir, ) + self.assertEqual( + proc.returncode, + 0, + f"The process running qnn_executorch_runner return {proc.returncode}, " + "STDOUT=\n" + f"{proc.stdout.decode('utf-8')}", + ) + # Verify the outputs post_process() self._assert_outputs_equal(outputs, ref_outputs) diff --git a/backends/qualcomm/utils/constants.py b/backends/qualcomm/utils/constants.py index 58538eb91e..9875c9f5af 100644 --- a/backends/qualcomm/utils/constants.py +++ b/backends/qualcomm/utils/constants.py @@ -7,16 +7,23 @@ # Qualcomm specific key # constants in backends/qualcomm/passes & backends/qualcomm/builders +QCOM_AXIS = "axis" QCOM_AXIS_ORDER = "axis_order" QCOM_BITWIDTH = "bitwidth" QCOM_DATA = "data" +QCOM_DTYPE = "dtype" QCOM_ENCODING = "encoding" QCOM_INSERTED_PERMUTE = "qnn_permute" +QCOM_OFFSET = "offset" QCOM_QUANTIZED_IO = "q_tensor_io" QCOM_QUANT_ATTRS = "quant_attrs" +QCOM_QUANT_MIN = "quant_min" +QCOM_QUANT_MAX = "quant_max" QCOM_REQUANTIZE = "requantize" +QCOM_SCALE = "scale" QCOM_SCALES = "scales" QCOM_SCALE_OFFSET = "scale_offset" +QCOM_ZERO_POINT = "zero_point" QCOM_ZERO_POINTS = "zero_points" # constants in backends/qualcomm/tests diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index b94e242df6..4d2a854de3 100644 --- a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -95,9 +95,9 @@ vkapi::DescriptorSet Context::get_descriptor_set( pipeline_layout_cache().retrieve(shader_layout); vkapi::SpecVarList spec_constants = { - SV(local_workgroup_size.data[0u]), - SV(local_workgroup_size.data[1u]), - SV(local_workgroup_size.data[2u])}; + SV(local_workgroup_size[0u]), + SV(local_workgroup_size[1u]), + SV(local_workgroup_size[2u])}; spec_constants.append(additional_constants); @@ -119,11 +119,11 @@ void Context::register_shader_dispatch( const utils::uvec3& global_workgroup_size) { // Adjust the global workgroup size based on the output tile size uint32_t global_wg_w = utils::div_up( - global_workgroup_size.data[0u], shader_descriptor.out_tile_size.data[0u]); + global_workgroup_size[0u], shader_descriptor.out_tile_size[0u]); uint32_t global_wg_h = utils::div_up( - global_workgroup_size.data[1u], shader_descriptor.out_tile_size.data[1u]); + global_workgroup_size[1u], shader_descriptor.out_tile_size[1u]); uint32_t global_wg_d = 
utils::div_up( - global_workgroup_size.data[2u], shader_descriptor.out_tile_size.data[2u]); + global_workgroup_size[2u], shader_descriptor.out_tile_size[2u]); // Submitting a global work group size of 0 is undefined behaviour. If this is // detected then submit a single workgroup instead. diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 85656f791a..4443f17544 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -15,18 +15,23 @@ namespace api { std::vector calculate_strides( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout, - const bool texel_strides) { + const utils::GPUMemoryLayout memory_layout) { + // For zero dim tensors + if (sizes.size() == 0) { + return {1}; + } + const int64_t dim_offset = utils::to_packed_dim_nchw_offset(memory_layout); - const int64_t last_dim = sizes.size() - dim_offset; - VK_CHECK_COND(last_dim >= 0); + int64_t last_dim = sizes.size() - dim_offset; + if (last_dim < 0) { + last_dim = sizes.size() - 1; + } size_t ndim = sizes.size(); std::vector strides(ndim); - const int64_t last_dim_size = - texel_strides ? utils::div_up_4(sizes.at(last_dim)) : sizes.at(last_dim); + const int64_t last_dim_size = sizes.at(last_dim); for (int stride_d = ndim - 1; stride_d >= 0; stride_d--) { strides.at(stride_d) = 1; @@ -43,6 +48,23 @@ std::vector calculate_strides( return strides; } +std::vector unsqueeze_strides( + const std::vector& strides, + const int64_t numel) { + const size_t ndim = strides.size(); + const size_t ndim_up4 = utils::align_up_4(strides.size()); + std::vector unsqueezed_strides(ndim_up4); + for (int32_t i = 1; i <= ndim; ++i) { + int64_t dim_stride = strides.at(ndim - i); + unsqueezed_strides.at(ndim_up4 - i) = dim_stride; + } + + for (int32_t i = ndim + 1; i <= ndim_up4; ++i) { + unsqueezed_strides.at(ndim_up4 - i) = numel; + } + return unsqueezed_strides; +} + std::vector calculate_padded_sizes( const std::vector& sizes, const utils::GPUMemoryLayout memory_layout) { @@ -108,15 +130,19 @@ vTensor::vTensor( const bool allocate_memory) : dtype_(dtype), memory_layout_(memory_layout), - // Calculate sizes and strides + // Calculate tensor size metadata sizes_(sizes.begin(), sizes.end()), + strides_(calculate_strides(sizes, memory_layout_)), + numel_(utils::multiply_integers(sizes_)), padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, + unsqueezed_strides_{unsqueeze_strides(strides_, numel_)}, + padded_numel_(utils::multiply_integers(padded_sizes_)), texture_limits_{{0, 0, 0}}, // Utility Uniform Buffers that can be passed to shaders as arguments sizes_uniform_(), + strides_uniform_(), + numel_uniform_(), texture_limits_uniform_(), - texel_strides_uniform_(), - ntexels_uniform_(), // Construct Tensor storage storage_( context, @@ -127,9 +153,9 @@ vTensor::vTensor( allocate_memory) { if (storage_type != utils::kBuffer) { texture_limits_.limits = utils::ivec3{ - utils::safe_downcast(storage_.image_extents_.data[0]), - utils::safe_downcast(storage_.image_extents_.data[1]), - utils::safe_downcast(storage_.image_extents_.data[2])}; + utils::safe_downcast(storage_.image_extents_[0]), + utils::safe_downcast(storage_.image_extents_[1]), + utils::safe_downcast(storage_.image_extents_[2])}; } if (dtype == vkapi::kHalf) { @@ -178,6 +204,14 @@ const vkapi::BufferBindInfo vTensor::sizes_ubo() { return vkapi::BufferBindInfo(sizes_uniform_.buffer()); } +const vkapi::BufferBindInfo vTensor::strides_ubo() { + 
if (!strides_uniform_.buffer()) { + strides_uniform_ = ParamsBuffer( + storage_.context_, utils::make_whcn_ivec4(unsqueezed_strides_)); + } + return vkapi::BufferBindInfo(strides_uniform_.buffer()); +} + const vkapi::BufferBindInfo vTensor::texture_limits_ubo() { if (!texture_limits_uniform_.buffer()) { texture_limits_uniform_ = ParamsBuffer(storage_.context_, texture_limits_); @@ -185,21 +219,24 @@ const vkapi::BufferBindInfo vTensor::texture_limits_ubo() { return vkapi::BufferBindInfo(texture_limits_uniform_.buffer()); } -const vkapi::BufferBindInfo vTensor::texel_strides_ubo() { - if (!texel_strides_uniform_.buffer()) { - texel_strides_uniform_ = ParamsBuffer( - storage_.context_, - utils::make_whcn_ivec4( - calculate_strides(padded_sizes_, memory_layout_))); +const vkapi::BufferBindInfo vTensor::numel_ubo() { + if (!numel_uniform_.buffer()) { + numel_uniform_ = ParamsBuffer(storage_.context_, numel_); } - return vkapi::BufferBindInfo(texel_strides_uniform_.buffer()); + return vkapi::BufferBindInfo(numel_uniform_.buffer()); } -const vkapi::BufferBindInfo vTensor::ntexels_ubo() { - if (!ntexels_uniform_.buffer()) { - ntexels_uniform_ = ParamsBuffer(storage_.context_, texel_numel()); +size_t vTensor::staging_buffer_numel() const { + const bool is_int8 = dtype_ == vkapi::kChar; + const bool int8_supported = + storage_.context_->adapter_ptr()->has_full_int8_buffers_support(); + if (is_int8 && !int8_supported) { + return utils::align_up_4(numel_); + } + if (storage_type() == utils::kBuffer) { + return numel_; } - return vkapi::BufferBindInfo(ntexels_uniform_.buffer()); + return padded_numel_; } VmaAllocationCreateInfo vTensor::get_allocation_create_info() const { @@ -238,7 +275,12 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) { void vTensor::update_size_metadata(const std::vector& new_sizes) { sizes_ = new_sizes; + strides_ = calculate_strides(new_sizes, memory_layout_); + numel_ = utils::multiply_integers(sizes_); + padded_sizes_ = calculate_padded_sizes(sizes_, memory_layout_); + unsqueezed_strides_ = unsqueeze_strides(strides_, numel_); + padded_numel_ = utils::multiply_integers(padded_sizes_); // Calculate the extents of the image texture that would have been required // for a tensor of the new sizes. @@ -247,9 +289,9 @@ void vTensor::update_size_metadata(const std::vector& new_sizes) { // Update the texture limits to reflect the new virtual extents. 
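For readers tracing the new buffer-storage metadata, the following is a rough Python sketch of the stride bookkeeping that calculate_strides() and unsqueeze_strides() above implement, restricted to the common contiguous (width-packed) case. The helper names are invented for the illustration; the C++ in Tensor.cpp remains the source of truth.

```python
# Illustrative only -- not part of the patch. Mirrors the intent of
# calculate_strides()/unsqueeze_strides() for a contiguous, width-packed tensor.

def contiguous_strides(sizes):
    # The packed (fastest-moving) dim gets stride 1; every earlier dim strides
    # over the product of the dims that follow it.
    strides, running = [0] * len(sizes), 1
    for d in range(len(sizes) - 1, -1, -1):
        strides[d] = running
        running *= sizes[d]
    return strides

def unsqueeze_strides(strides, numel):
    # Pad the dimensionality up to a multiple of 4. Existing strides keep their
    # order at the tail; the new leading dims get a stride of numel, so stepping
    # along them never moves inside the real data.
    ndim = len(strides)
    ndim_up4 = ((ndim + 3) // 4) * 4
    out = [0] * ndim_up4
    for i in range(1, ndim + 1):
        out[ndim_up4 - i] = strides[ndim - i]
    for i in range(ndim + 1, ndim_up4 + 1):
        out[ndim_up4 - i] = numel
    return out

sizes = [2, 3, 4]                       # a 3-dim example tensor, numel = 24
strides = contiguous_strides(sizes)     # [12, 4, 1]
print(unsqueeze_strides(strides, 24))   # [24, 12, 4, 1]
```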
texture_limits_.limits = utils::ivec3{ - utils::safe_downcast(virtual_extents.data[0]), - utils::safe_downcast(virtual_extents.data[1]), - utils::safe_downcast(virtual_extents.data[2])}; + utils::safe_downcast(virtual_extents[0]), + utils::safe_downcast(virtual_extents[1]), + utils::safe_downcast(virtual_extents[2])}; if (sizes_uniform_.buffer()) { sizes_uniform_.update(utils::make_whcn_ivec4(sizes_)); @@ -257,12 +299,11 @@ void vTensor::update_size_metadata(const std::vector& new_sizes) { if (texture_limits_uniform_.buffer()) { texture_limits_uniform_.update(texture_limits_); } - if (texel_strides_uniform_.buffer()) { - texel_strides_uniform_.update(utils::make_whcn_ivec4( - calculate_strides(padded_sizes_, memory_layout_))); + if (strides_uniform_.buffer()) { + strides_uniform_.update(utils::make_whcn_ivec4(unsqueezed_strides_)); } - if (ntexels_uniform_.buffer()) { - ntexels_uniform_.update(texel_numel()); + if (numel_uniform_.buffer()) { + numel_uniform_.update(numel_); } } @@ -281,11 +322,9 @@ void vTensor::virtual_resize(const std::vector& new_sizes) { utils::uvec3 virtual_extents = calculate_image_extents(padded_sizes_, memory_layout_); - bool valid_resize = virtual_extents.data[0] <= image_extents().data[0]; - valid_resize = - valid_resize && virtual_extents.data[1] <= image_extents().data[1]; - valid_resize = - valid_resize && virtual_extents.data[2] <= image_extents().data[2]; + bool valid_resize = virtual_extents[0] <= image_extents()[0]; + valid_resize = valid_resize && virtual_extents[1] <= image_extents()[1]; + valid_resize = valid_resize && virtual_extents[2] <= image_extents()[2]; VK_CHECK_COND( valid_resize, diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 6ec5ba5b09..b1a02a6d2e 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -24,16 +24,14 @@ namespace api { * of the tensor in NCHW dimension order. The GPU memory layout will be used to * determine which dimension is packed along a texel; that dimension will be * used as the "fasted moving" dimension with a stride of 1. - * - * If texel_strides is true, then the strides will be calculated for a texel - * buffer (i.e. the size of the packed dimension will be modified by the - * div_up_4 function before being used in calculations). Otherwise, the strides - * will be calculated assuming a contiguous scalar buffer. */ std::vector calculate_strides( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout, - const bool texel_strides = true); + const utils::GPUMemoryLayout memory_layout); + +std::vector unsqueeze_strides( + const std::vector& strides, + const int64_t numel); /* * When stored on the GPU, tensor data is stored using texels (i.e. a vector of @@ -169,11 +167,24 @@ class vTensor final { // sizes of the tensor in NCHW dimension order std::vector sizes_; + // strides of the tensor in NCHW dimension order + std::vector strides_; + // Contains the number of elements in the tensor according to the canonical + // sizes. + size_t numel_; // padded sizes of the tensor in NCHW dimension order. See the - // calculate_padded_sizes() function for more context. + // calculate_padded_sizes() function for more context. Note that padded sizes + // are only used for texture storage, and not for buffer storage. std::vector padded_sizes_; + // Contains the strides of the tensor, with the dimensionality padded to the + // nearest multiple of 4. 
Unsqueezed dims will have a stride of int32_t max. + std::vector unsqueezed_strides_; + // Contains the number of elements in the tensor according to the padded + // sizes. + size_t padded_numel_; // Contains the "virtual" texture extents of the tensor. See the - // texture_limits() function for more context. + // texture_limits() function for more context. Note that the texture limits + // are only relevant for texture storage, and not for buffer storage. TextureLimits texture_limits_; /* @@ -186,9 +197,9 @@ class vTensor final { * context about the data contained in each buffer. */ ParamsBuffer sizes_uniform_; + ParamsBuffer strides_uniform_; + ParamsBuffer numel_uniform_; ParamsBuffer texture_limits_uniform_; - ParamsBuffer texel_strides_uniform_; - ParamsBuffer ntexels_uniform_; vTensorStorage storage_; @@ -266,6 +277,14 @@ class vTensor final { return sizes_.size(); } + inline const std::vector& strides() const { + return strides_; + } + + inline const std::vector& unsqueezed_strides() const { + return unsqueezed_strides_; + } + /* * Returns a GPU buffer containing the sizes of the tensor in WHCN order. * Note that dimensions that are not present in the tensor's sizes are set to @@ -273,6 +292,14 @@ class vTensor final { */ const vkapi::BufferBindInfo sizes_ubo(); + /* + * Returns a GPU buffer containing the strides of the tensor in WHCN order. + * Note that the strides are extended to a dimensionality that is a multiple + * of 4, thus dimensions that are not present in the tensor's sizes are set to + * have a stride equal to the stride of the "slowest moving" dimension. + */ + const vkapi::BufferBindInfo strides_ubo(); + /* * Returns a GPU buffer containing the virtual image extents of the tensor. * Since a tensor can be resized with the virtual_resize() function, this @@ -285,22 +312,16 @@ class vTensor final { const vkapi::BufferBindInfo texture_limits_ubo(); /* - * Returns the strides of the texel buffer used to store the tensor, as - * calculated by calculate_strides(). + * Returns the number of elements in the buffer used to store the tensor. */ - const vkapi::BufferBindInfo texel_strides_ubo(); - - /* - * Returns the number of texels in the texel buffer used to store the tensor. - */ - const vkapi::BufferBindInfo ntexels_ubo(); + const vkapi::BufferBindInfo numel_ubo(); inline const utils::ivec3 texture_limits() const { return texture_limits_.limits; } inline size_t numel() const { - return utils::multiply_integers(sizes()); + return numel_; } inline size_t nbytes() const { @@ -310,23 +331,14 @@ class vTensor final { /* * Returns numel but based on padded_sizes_ instead of sizes_ */ - inline size_t gpu_numel() const { - return utils::multiply_integers(padded_sizes_); + inline size_t padded_numel() const { + return padded_numel_; } - /* - * Returns the number of texels in the image texture or texel buffer used to - * store the tensor's data. 
- */ - inline int32_t texel_numel() const { - return utils::safe_downcast(gpu_numel() / 4); - } + size_t staging_buffer_numel() const; - /* - * Return nbytes but based on padded_sizes_ instead of sizes_ - */ - inline VkDeviceSize gpu_nbytes() const { - return element_size(dtype()) * gpu_numel(); + inline size_t staging_buffer_nbytes() const { + return element_size(dtype()) * staging_buffer_numel(); } /* diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index c734ed395e..f4ba98b31f 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -231,7 +231,7 @@ def layout_declare_tensor( var_name: str, dtype: str, storage_type: str, - is_scalar_array: bool = False, + is_scalar_array: bool = True, precision: str = "PRECISION", ) -> str: assert storage_type.lower() in ["buffer", "texture3d", "texture2d"] diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index fb2c379c1b..a773a33f06 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -248,8 +248,10 @@ ValueRef ComputeGraph::set_input_tensor( const bool use_staging) { if (use_staging) { vkapi::ScalarType dtype = get_tensor(idx)->dtype(); - size_t gpu_numel = get_tensor(idx)->gpu_numel(); - ValueRef staging_idx = add_staging(dtype, gpu_numel); + // For texture storage, the buffer size needs to account for the zero + // padding applied by unused texel elements. + size_t buf_numel = get_tensor(idx)->staging_buffer_numel(); + ValueRef staging_idx = add_staging(dtype, buf_numel); add_staging_to_tensor_node(*this, staging_idx, idx); inputs_.push_back({idx, staging_idx}); return staging_idx; @@ -263,12 +265,14 @@ ValueRef ComputeGraph::set_output_tensor( const bool use_staging) { if (use_staging) { vkapi::ScalarType dtype = get_tensor(idx)->dtype(); - size_t gpu_numel = get_tensor(idx)->gpu_numel(); - ValueRef staging_idx = add_staging(dtype, gpu_numel); + // For texture storage, the buffer size needs to account for the zero + // padding applied by unused texel elements. + size_t buf_numel = get_tensor(idx)->staging_buffer_numel(); + ValueRef staging_idx = add_staging(dtype, buf_numel); // We only run this when the tensor is non-empty. When the underlying - // tensor is empty (e.g. gpu_numel == 0), we do not allocate a VkImage to + // tensor is empty (e.g. padded_numel == 0), we do not allocate a VkImage to // tensor, we will not be able to bind the node for execution. 
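As a reading aid for the staging-buffer sizing that ComputeGraph now relies on, here is a short, hedged Python restatement; the parameter names are made up for the illustration and vTensor::staging_buffer_numel() above is authoritative.

```python
def staging_buffer_numel(numel, padded_numel, *, is_buffer_storage,
                         is_int8, has_full_int8_buffers_support):
    # int8 data on devices without full int8 buffer support is shuttled through
    # an int32 staging buffer, so round the element count up to a multiple of 4.
    if is_int8 and not has_full_int8_buffers_support:
        return (numel + 3) // 4 * 4
    # Buffer-backed tensors copy exactly numel elements.
    if is_buffer_storage:
        return numel
    # Texture-backed tensors need the padded count so the staging buffer covers
    # the zero padding in partially filled texels.
    return padded_numel
```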
- if (gpu_numel > 0) { + if (buf_numel > 0) { add_tensor_to_staging_node(*this, idx, staging_idx); } outputs_.push_back({idx, staging_idx}); @@ -314,7 +318,7 @@ void ComputeGraph::update_descriptor_counts( utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) { if (is_buffer_storage(idx)) { - return {uint32_t(texel_numel_of(idx)), 1u, 1u}; + return {uint32_t(numel_of(idx)), 1u, 1u}; } return image_extents_of(idx); } @@ -327,19 +331,19 @@ utils::uvec3 ComputeGraph::create_local_wg_size( utils::uvec3 local_group_size = {4, 4, 4}; - if (global_wg_size.data[2u] == 1) { - if (global_wg_size.data[1u] == 1) { - local_group_size.data[0u] = 64; - local_group_size.data[1u] = 1; - local_group_size.data[2u] = 1; - } else if (global_wg_size.data[1u] < 8) { - local_group_size.data[0u] = 16; - local_group_size.data[1u] = 4; - local_group_size.data[2u] = 1; + if (global_wg_size[2u] == 1) { + if (global_wg_size[1u] == 1) { + local_group_size[0u] = 64; + local_group_size[1u] = 1; + local_group_size[2u] = 1; + } else if (global_wg_size[1u] < 8) { + local_group_size[0u] = 16; + local_group_size[1u] = 4; + local_group_size[2u] = 1; } else { - local_group_size.data[0u] = 8; - local_group_size.data[1u] = 8; - local_group_size.data[2u] = 1; + local_group_size[0u] = 8; + local_group_size[1u] = 8; + local_group_size[2u] = 1; } } return local_group_size; diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 898a856291..e09cd69345 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -192,8 +192,8 @@ class ComputeGraph final { return values_.at(idx).toConstTensor().image_extents(); } - inline int32_t texel_numel_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().texel_numel(); + inline int32_t numel_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().numel(); } inline utils::StorageType storage_type_of(const ValueRef idx) const { @@ -216,16 +216,16 @@ class ComputeGraph final { return values_.at(idx).toTensor().sizes_ubo(); } - inline vkapi::BufferBindInfo texture_limits_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().texture_limits_ubo(); + inline vkapi::BufferBindInfo strides_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().strides_ubo(); } - inline vkapi::BufferBindInfo texel_strides_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().texel_strides_ubo(); + inline vkapi::BufferBindInfo numel_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().numel_ubo(); } - inline vkapi::BufferBindInfo ntexels_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().ntexels_ubo(); + inline vkapi::BufferBindInfo texture_limits_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().texture_limits_ubo(); } // diff --git a/backends/vulkan/runtime/graph/ops/glsl/activations.h b/backends/vulkan/runtime/graph/ops/glsl/activations.h new file mode 100644 index 0000000000..c5ee3b2085 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/activations.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +float hardswish(float x) { + if (x <= -3) { + return 0; + } else if (x >= 3) { + return x; + } else { + return x * (x + 3) / 6; + } +} + +vec4 hardswish(vec4 tex) { + return vec4( + hardswish(tex.x), hardswish(tex.y), hardswish(tex.z), hardswish(tex.w)); +} + +float hardshrink(float x, float lambda, float neg_lambda) { + return x * (float(x > lambda) + float(x < neg_lambda)); +} + +vec4 hardshrink(vec4 tex, float lambda, float neg_lambda) { + return tex * + (vec4(greaterThan(tex, vec4(lambda))) + + vec4(lessThan(tex, vec4(neg_lambda)))); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl new file mode 100644 index 0000000000..9d4b18f0d1 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl @@ -0,0 +1,23 @@ +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "out_buf", DTYPE, STORAGE)} +${layout_declare_tensor(1, "r", "in_buf", DTYPE, STORAGE)} +${layout_declare_ubo(2, "int", "numel")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + int tid = int(gl_GlobalInvocationID.x); + if (tid >= numel) { + return; + } + out_buf[tid] = in_buf[tid]; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml new file mode 100644 index 0000000000..8ea4cbe561 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +buffer_to_buffer: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + - VALUE: int + - VALUE: int8 + shader_variants: + - NAME: buffer_to_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl new file mode 100644 index 0000000000..58796879e8 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl @@ -0,0 +1,35 @@ +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +#include "indexing_utils.h" + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "nchw_buf", DTYPE, STORAGE)} +${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_ubo(2, "ivec4", "in_sizes")} +${layout_declare_ubo(3, "ivec4", "in_strides")} +${layout_declare_ubo(4, "int", "numel")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// This constant is unused in this shader but is kept so that the signature is +// consistent with image_to_nchw.
+layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM; + +void main() { + int out_id = int(gl_GlobalInvocationID.x); + if (out_id >= numel) { + return; + } + + ivec4 t_in_idx = from_nchw_buffer_i(out_id, in_sizes); + const int in_id = to_buffer_id(t_in_idx, in_strides); + + nchw_buf[out_id] = t_in[in_id]; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml new file mode 100644 index 0000000000..653bda9ccc --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +buffer_to_nchw: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + - VALUE: int + - VALUE: int8 + shader_variants: + - NAME: buffer_to_nchw diff --git a/backends/vulkan/runtime/graph/ops/glsl/tensor_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl similarity index 73% rename from backends/vulkan/runtime/graph/ops/glsl/tensor_to_nchw.glsl rename to backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl index d545e5d86e..b51d5a3f6e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/tensor_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl @@ -21,12 +21,9 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_tensor(0, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_buffer(1, "w", "nchw_out", DTYPE)} +${layout_declare_buffer(0, "w", "nchw_out", DTYPE)} +${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} ${layout_declare_ubo(2, "ivec4", "sizes")} -$if STORAGE == "buffer": - ${layout_declare_ubo(3, "ivec4", "gpu_strides")} - ${layout_declare_ubo(4, "int", "ntexels")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -52,22 +49,6 @@ void write_out_texel(VEC4_T texel, ivec4 tensor_idx) { } } -#ifdef USING_BUFFER - -void main() { - const int t_id = int(gl_GlobalInvocationID.x); - if (t_id >= ntexels) { - return; - } - - const VEC4_T intex = t_in[t_id]; - ivec4 tensor_idx = to_tensor_idx(t_id, gpu_strides, packed_dim); - tensor_idx[packed_dim] *= 4; - write_out_texel(intex, tensor_idx); -} - -#else // USING_TEXTURE - void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim); @@ -79,5 +60,3 @@ void main() { const VEC4_T intex = load_texel(t_in, pos); write_out_texel(intex, tensor_idx); } - -#endif diff --git a/backends/vulkan/runtime/graph/ops/glsl/tensor_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml similarity index 88% rename from backends/vulkan/runtime/graph/ops/glsl/tensor_to_nchw.yaml rename to backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml index 93a261e1ee..0898e75110 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/tensor_to_nchw.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-tensor_to_nchw: +image_to_nchw: parameter_names_with_default_values: DTYPE: float STORAGE: texture3d @@ -17,6 +17,5 @@ tensor_to_nchw: STORAGE: - VALUE: texture3d - VALUE: texture2d - - VALUE: buffer shader_variants: - - NAME: tensor_to_nchw + - NAME: image_to_nchw diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index d3264e43a2..21eadff0b3 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -41,6 +41,21 @@ */ #define alignup4(x) ((x + 3) & -4) +/* + * Input: (W, H, C, N) strides of a tensor + * Returns: the WHCN index of the fastest moving dimension + */ +int find_packed_dim(const ivec4 strides) { + int packed_dim = 0; + for (int i = 0; i <= 3; i++) { + if (strides[i] == 1) { + packed_dim = i; + break; + } + } + return packed_dim; +} + // // (w, h, c, n) Tensor Index <-> Contiguous Buffer Index Conversion // @@ -74,27 +89,49 @@ ivec4 from_nchw_buffer_i(int buf_i, ivec4 sizes) { (buf_i / (sizes.x * sizes.y * sizes.z))); } +int to_nchw_buffer_i(const ivec4 tensor_idx, const ivec4 sizes) { + return tensor_idx.w * sizes.x * sizes.y * sizes.z + + tensor_idx.z * sizes.x * sizes.y + tensor_idx.y * sizes.x + tensor_idx.x; +} + /* * Input: Texel buffer index, (W, H, C, N) strides of a tensor, which dim is * packed along a texel - * Returns: The (x, y, z, n) texel position corresponding to the first element - * of the texel at the specified buffer index + * Returns: The (w, h, c, n) tensor index corresponding to the buffer element */ -ivec4 to_tensor_idx(int buf_i, ivec4 strides, int packed_dim) { +ivec4 to_tensor_idx(int buffer_id, const ivec4 strides, const int packed_dim) { ivec4 idx; for (int i = 3; i >= 0; i--) { if (i != packed_dim) { - idx[i] = buf_i / strides[i]; - buf_i %= strides[i]; + idx[i] = buffer_id / strides[i]; + buffer_id %= strides[i]; } } - idx[packed_dim] = buf_i; + idx[packed_dim] = buffer_id; return idx; } -int to_texel_idx(const ivec4 texel_pos, ivec4 strides) { - return texel_pos.x * strides.x + texel_pos.y * strides.y + - texel_pos.z * strides.z + texel_pos.w * strides.w; +/* + * Input: Texel buffer index, (W, H, C, N) strides of a tensor + * Returns: The (w, h, c, n) tensor index corresponding to the buffer element + * + * This is a convenience overload of the above function. If the packed dim is + * not known, it can be found by finding the first dimension with a stride of 1. + * However, this process adds some overhead, so if performance is a concern then + * the above function should be used instead so that the packed dim is provided. 
+ */ +ivec4 to_tensor_idx(int buffer_id, const ivec4 strides) { + int packed_dim = find_packed_dim(strides); + return to_tensor_idx(buffer_id, strides, packed_dim); +} + +/* + * Input: (w, h, c, n) tensor index, (W, H, C, N) strides of the tensor buffer + * Returns: the buffer index corresponding to the specified tensor index + */ +int to_buffer_id(const ivec4 tensor_idx, ivec4 strides) { + return tensor_idx.x * strides.x + tensor_idx.y * strides.y + + tensor_idx.z * strides.z + tensor_idx.w * strides.w; } // diff --git a/backends/vulkan/runtime/graph/ops/glsl/int8_tensor_to_nchw_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl similarity index 73% rename from backends/vulkan/runtime/graph/ops/glsl/int8_tensor_to_nchw_noint8.glsl rename to backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl index 21290d0ce8..b1e3a0abdf 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/int8_tensor_to_nchw_noint8.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl @@ -16,10 +16,10 @@ layout(std430) buffer; #extension GL_EXT_control_flow_attributes : require -${layout_declare_tensor(0, "r", "t_in", "int8", "texture3d")} -${layout_declare_buffer(1, "w", "nchw_out", "int")} +${layout_declare_buffer(0, "w", "nchw_out", "int")} +${layout_declare_tensor(1, "r", "t_in", "int8", "texture3d")} ${layout_declare_ubo(2, "ivec4", "tensor_sizes")} -${layout_declare_ubo(3, "int", "out_ntexels")} +${layout_declare_ubo(3, "int", "out_numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -27,7 +27,12 @@ layout(constant_id = 3) const int packed_dim = C_DIM; void main() { const int out_buf_idx = int(gl_GlobalInvocationID.x); - if (out_buf_idx >= out_ntexels) { + // On the CPU, the number of elements is determined based on a buffer of int8 + // elements. However, on the GPU, since the int8 data type is not supported, + // each group of 4 elements is interpreted as 1 int32 element. Thus each + // thread is actually writing to 4 output elements from the perspective of the + // CPU. + if (out_buf_idx * 4 >= out_numel) { return; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl new file mode 100644 index 0000000000..d861972f93 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl @@ -0,0 +1,35 @@ +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +#include "indexing_utils.h" + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(1, "r", "nchw_in", DTYPE, STORAGE)} +${layout_declare_ubo(2, "ivec4", "out_sizes")} +${layout_declare_ubo(3, "ivec4", "out_strides")} +${layout_declare_ubo(4, "int", "numel")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// This constant is unused in this shader but is kept so that the signature is +// consistent with nchw_to_image.
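A concrete index round trip may help when reading the new buffer shaders. The Python below restates the indexing helpers added to indexing_utils.h and checks the mapping nchw_to_buffer.glsl performs for a width-packed tensor; it is an illustration that follows the GLSL in this diff, not part of the patch.

```python
# Vectors are in (W, H, C, N) order, matching the WHCN ivec4s used by the shaders.

def find_packed_dim(strides):
    # The first dim with stride 1 is the fastest-moving ("packed") dim.
    return next(i for i in range(4) if strides[i] == 1)

def to_tensor_idx(buffer_id, strides):
    packed_dim = find_packed_dim(strides)
    idx = [0, 0, 0, 0]
    for i in range(3, -1, -1):
        if i != packed_dim:
            idx[i], buffer_id = divmod(buffer_id, strides[i])
    idx[packed_dim] = buffer_id
    return idx

def to_nchw_buffer_i(idx, sizes):
    w, h, c, n = idx
    W, H, C, _ = sizes
    return n * W * H * C + c * W * H + h * W + w

sizes = [4, 3, 2, 1]       # W, H, C, N
strides = [1, 4, 12, 24]   # width-packed, contiguous
for out_id in range(24):
    # As in nchw_to_buffer.glsl: strided id -> WHCN idx -> NCHW source id.
    # For this contiguous layout the mapping is the identity; other packed
    # layouts permute elements between the two buffers.
    assert to_nchw_buffer_i(to_tensor_idx(out_id, strides), sizes) == out_id
```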
+layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM; + +void main() { + int out_id = int(gl_GlobalInvocationID.x); + if (out_id >= numel) { + return; + } + + ivec4 out_idx = to_tensor_idx(out_id, out_strides); + const int in_id = to_nchw_buffer_i(out_idx, out_sizes); + + t_out[out_id] = nchw_in[in_id]; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml new file mode 100644 index 0000000000..6292ef9333 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +nchw_to_buffer: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + - VALUE: int + - VALUE: int8 + shader_variants: + - NAME: nchw_to_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_tensor.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl similarity index 79% rename from backends/vulkan/runtime/graph/ops/glsl/nchw_to_tensor.glsl rename to backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl index c218482b09..abe9390480 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_tensor.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl @@ -24,9 +24,6 @@ layout(std430) buffer; ${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_buffer(1, "r", "nchw_in", DTYPE)} ${layout_declare_ubo(2, "ivec4", "sizes")} -$if STORAGE == "buffer": - ${layout_declare_ubo(3, "ivec4", "gpu_strides")} - ${layout_declare_ubo(4, "int", "ntexels")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -54,21 +51,6 @@ VEC4_T read_texel(ivec4 tensor_idx) { return texel; } -#ifdef USING_BUFFER - -void main() { - const int t_id = int(gl_GlobalInvocationID.x); - if (t_id >= ntexels) { - return; - } - - ivec4 tensor_idx = to_tensor_idx(t_id, gpu_strides, packed_dim); - tensor_idx[packed_dim] *= 4; - t_out[t_id] = read_texel(tensor_idx); -} - -#else // USING_TEXTURE - void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim); @@ -78,5 +60,3 @@ void main() { write_texel(t_out, pos, read_texel(tensor_idx)); } - -#endif // USING_BUFFER diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_tensor.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml similarity index 88% rename from backends/vulkan/runtime/graph/ops/glsl/nchw_to_tensor.yaml rename to backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml index 96fe55dfb4..2bf85a7492 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_tensor.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-nchw_to_tensor: +nchw_to_image: parameter_names_with_default_values: STORAGE: texture3d DTYPE: float @@ -17,6 +17,5 @@ nchw_to_tensor: STORAGE: - VALUE: texture3d - VALUE: texture2d - - VALUE: buffer shader_variants: - - NAME: nchw_to_tensor + - NAME: nchw_to_image diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_tensor_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl similarity index 100% rename from backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_tensor_noint8.glsl rename to backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl index 37988f21ec..7557a7b0c3 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl @@ -11,6 +11,7 @@ #define PRECISION ${PRECISION} #define VEC4_T ${texel_load_type(DTYPE, STORAGE)} +#define FLOAT_T ${buffer_scalar_type(DTYPE)} ${define_active_storage_type(STORAGE)} @@ -28,9 +29,9 @@ ${layout_declare_tensor(3, "r", "t_scales", DTYPE, STORAGE)} $if STORAGE == "buffer": ${layout_declare_ubo(4, "ivec4", "out_sizes")} - ${layout_declare_ubo(5, "int", "ntexels")} - ${layout_declare_ubo(6, "ivec4", "mat1_sizes")} - ${layout_declare_ubo(7, "ivec4", "out_strides")} + ${layout_declare_ubo(5, "ivec4", "out_strides")} + ${layout_declare_ubo(6, "int", "out_numel")} + ${layout_declare_ubo(7, "ivec4", "mat1_sizes")} ${layout_declare_ubo(8, "ivec4", "mat1_strides")} ${layout_declare_ubo(9, "ivec4", "qmat2_strides")} ${layout_declare_ubo(10, "ivec4", "scales_strides")} @@ -49,14 +50,13 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const int t_id = int(gl_GlobalInvocationID.x); - if (t_id >= ntexels) { + if (t_id >= out_numel) { return; } - const ivec4 out_pos = to_tensor_idx(t_id, out_strides, 0); + const ivec4 out_idx = to_tensor_idx(t_id, out_strides, 0); - VEC4_T outtex = q_8w_linear(out_pos, mat1_sizes.x); - write_texel(t_out, t_id, outtex); + t_out[t_id] = q_8w_linear(out_idx, mat1_sizes.x); } #else // USING_TEXTURE diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_linear.h b/backends/vulkan/runtime/graph/ops/glsl/q_linear.h index c1411376ad..f6de1e6dcf 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_linear.h +++ b/backends/vulkan/runtime/graph/ops/glsl/q_linear.h @@ -16,36 +16,33 @@ #ifdef USING_BUFFER -VEC4_T q_8w_linear(const ivec4 out_pos, const int K) { - const VEC4_T scales = load_texel(t_scales, out_pos.x); +#ifndef FLOAT_T +#define FLOAT_T float +#endif - VEC4_T outtex = VEC4_T(0); +FLOAT_T q_8w_linear(const ivec4 out_idx, const int K) { + const FLOAT_T scale = t_scales[out_idx.x]; - // Initial mat1 pos will be (0, out_pos.y, out_pos.z, 0) - int mat1_tid = out_pos.y * mat1_strides.y + out_pos.z * qmat2_strides.z; - // Initial qmat2 pos wil be (0, out_pos.x * 4, 0, 0) - int qmat2_tid = out_pos.x * 4 * qmat2_strides.y; + FLOAT_T outval = FLOAT_T(0.0); - // TODO(ssjia): optimize memory access pattern by traversing K in inner loop - for (int i = 0; i < K; i += 4) { - const VEC4_T mat1_tex = load_texel(t_mat1, mat1_tid); + // Initial mat1 tensor idx will be (0, out_idx.y, out_idx.z, 0) + int mat1_offset = out_idx.y * mat1_strides.y + out_idx.z * qmat2_strides.z; + // Initial qmat2 tensor idx wil be (0, out_idx.x, 0, 0); note that the qmat2 + // tensor is transposed + int qmat2_offset = out_idx.x * qmat2_strides.y; - const VEC4_T sums = VEC4_T( - 
dot(mat1_tex, load_texel(t_qmat2, qmat2_tid) * scales.x), - dot(mat1_tex, - load_texel(t_qmat2, qmat2_tid + qmat2_strides.y) * scales.y), - dot(mat1_tex, - load_texel(t_qmat2, qmat2_tid + qmat2_strides.y * 2) * scales.z), - dot(mat1_tex, - load_texel(t_qmat2, qmat2_tid + qmat2_strides.y * 3) * scales.w)); + // TODO(ssjia): optimize memory access pattern by traversing K in inner loop + for (int i = 0; i < K; i++) { + const FLOAT_T mat1_val = t_mat1[mat1_offset]; + const FLOAT_T mat2_val = t_qmat2[qmat2_offset] * scale; - outtex += sums; + outval += mat1_val * mat2_val; - mat1_tid++; - qmat2_tid++; + mat1_offset++; + qmat2_offset++; } - return outtex; + return outval; } #else // USING_TEXTURE diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl index 0cad62d38c..b645905939 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl @@ -11,6 +11,7 @@ #define PRECISION ${PRECISION} #define VEC4_T ${texel_load_type(DTYPE, STORAGE)} +#define T ${buffer_scalar_type(DTYPE)} #define op(X, A, B) ${OPERATOR} @@ -18,46 +19,33 @@ ${define_active_storage_type(STORAGE)} #include "indexing_utils.h" -$if DTYPE == "half" and STORAGE == "buffer": - #extension GL_EXT_shader_16bit_storage : require - #extension GL_EXT_shader_explicit_arithmetic_types_float16: require +${define_required_extensions(DTYPE)} layout(std430) buffer; ${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} $if STORAGE == "buffer": - ${layout_declare_ubo(2, "int", "ntexels")} + ${layout_declare_ubo(2, "int", "numel")} $else: ${layout_declare_ubo(2, "ivec3", "out_limits")} ${layout_declare_ubo(3, "float", "minimum")} ${layout_declare_ubo(4, "float", "maximum")} - layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -float hardswish(float x){ - if(x <= -3) { - return 0; - } - else if(x >= 3) { - return x; - } - else { - return x * (x + 3)/6; - } -} +#include "activations.h" #ifdef USING_BUFFER void main() { const int i = int(gl_GlobalInvocationID.x); - if (i >= ntexels) { + if (i >= numel) { return; } - vec4 in_texel = vec4(t_in[i]); - t_out[i] = VEC4_T(op(in_texel, minimum, maximum)); + float in_val = float(t_in[i]); + t_out[i] = T(op(in_val, minimum, maximum)); } #else diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml index f39abc2134..eb05b10b10 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml @@ -35,6 +35,6 @@ unary_op: - NAME: tanh OPERATOR: tanh(clamp(X, -15.0, 15.0)) - NAME: hardshrink - OPERATOR: X * (vec4(greaterThan(X, vec4(A))) + vec4(lessThan(X, vec4(B)))) + OPERATOR: hardshrink(X, A, B) - NAME: hardswish - OPERATOR: vec4(hardswish(X.x),hardswish(X.y),hardswish(X.z),hardswish(X.w)) + OPERATOR: hardswish(X) diff --git a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp index 04acec5937..cd947091bc 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp @@ -43,7 +43,7 @@ void add_cat_default_node( utils::ivec3 range = t_in->texture_limits(); add_copy_offset_node( graph, input_ref, range, src_offset, dst_offset, out); - dst_offset.data[0] += range.data[0]; + dst_offset[0] += range[0]; } } else if (dim_index == kHeight4D) { @@ -55,7 +55,7 @@ void add_cat_default_node( 
utils::ivec3 range = t_in->texture_limits(); add_copy_offset_node( graph, input_ref, range, src_offset, dst_offset, out); - dst_offset.data[1] += range.data[1]; + dst_offset[1] += range[1]; } } else if (dim_index == kBatch4D) { utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false); @@ -66,7 +66,7 @@ void add_cat_default_node( utils::ivec3 range = t_in->texture_limits(); add_copy_offset_node( graph, input_ref, range, src_offset, dst_offset, out); - dst_offset.data[2] += range.data[2]; + dst_offset[2] += range[2]; } } else if (dim_index == kChannel4D) { int32_t src_offset = 0; diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index cbee886ad2..52af0542b6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -242,10 +242,8 @@ Conv2dParams create_conv2d_params( const Kernel2dParams& p, const bool transposed) { const auto& overlay_region = utils::make_ivec2({ - p.kernel_size.data[0] + - (p.kernel_size.data[0] - 1) * (p.dilation.data[0] - 1), - p.kernel_size.data[1] + - (p.kernel_size.data[1] - 1) * (p.dilation.data[1] - 1), + p.kernel_size[0] + (p.kernel_size[0] - 1) * (p.dilation[0] - 1), + p.kernel_size[1] + (p.kernel_size[1] - 1) * (p.dilation[1] - 1), }); const auto weight_sizes = graph.sizes_of(weight); const int32_t in_group_size = utils::safe_downcast( @@ -255,15 +253,13 @@ Conv2dParams create_conv2d_params( void check_conv2d_params(const Kernel2dParams& p, const bool transposed) { if (transposed) { - if (p.dilation.data[0] > 1 || p.dilation.data[1] > 1) { + if (p.dilation[0] > 1 || p.dilation[1] > 1) { VK_THROW( "aten.convolution.default: transposed = true, dilation > 1 is not supported yet!"); } } - if ((p.padding.data[0] > 0 && p.kernel_size.data[0] > 1 && - p.dilation.data[0] > 1) || - (p.padding.data[1] > 0 && p.kernel_size.data[1] > 1 && - p.dilation.data[1] > 1)) { + if ((p.padding[0] > 0 && p.kernel_size[0] > 1 && p.dilation[0] > 1) || + (p.padding[1] > 0 && p.kernel_size[1] > 1 && p.dilation[1] > 1)) { VK_THROW( "aten.convolution.default: padding > 0 while dilation, kernel_size > 1 is not supported yet!"); } @@ -297,9 +293,9 @@ utils::uvec3 create_conv2d_global_wg_size( if (method == Conv2dMethod::Pointwise) { const utils::uvec3 image_extents = graph.image_extents_of(out); return { - utils::div_up(image_extents.data[0u], 2u), - utils::div_up(image_extents.data[1u], 2u), - image_extents.data[2u]}; + utils::div_up(image_extents[0u], 2u), + utils::div_up(image_extents[1u], 2u), + image_extents[2u]}; } else { return graph.create_global_wg_size(out); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp index 7baf921bf0..e78fca15a0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp @@ -62,7 +62,7 @@ void add_permute_node( !seen[permute_dim], "Argument dim ", permute_dim, " is repeated"); seen[permute_dim] = true; - out_dims.data[(4u - out_ndim) + i] = permute_dim + (4 - out_ndim); + out_dims[(4u - out_ndim) + i] = permute_dim + (4 - out_ndim); } std::string kernel_name = "permute"; diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp index cf887b6c1a..732643ef75 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp @@ -87,12 +87,12 
@@ void add_q_8w_linear_node( if (graph.is_buffer_storage(out)) { ubos.append( {graph.sizes_ubo(out), - graph.ntexels_ubo(out), + graph.strides_ubo(out), + graph.numel_ubo(out), graph.sizes_ubo(mat1), - graph.texel_strides_ubo(out), - graph.texel_strides_ubo(mat1), - graph.texel_strides_ubo(q_mat2), - graph.texel_strides_ubo(scales)}); + graph.strides_ubo(mat1), + graph.strides_ubo(q_mat2), + graph.strides_ubo(scales)}); } else { ubos.append({graph.texture_limits_ubo(out), graph.sizes_ubo(mat1)}); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp index 0eda7d8260..3ef80dc49c 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp @@ -83,8 +83,7 @@ void add_repeat_channel_node( utils::ivec4 in_whcn_sizes{in_width, in_height, in_channel, in_batch}; // Channel packed global work ids - running_range.data[2] = - out_whcn_sizes.data[3] * utils::div_up_4(out_whcn_sizes.data[2]); + running_range[2] = out_whcn_sizes[3] * utils::div_up_4(out_whcn_sizes[2]); utils::uvec3 global_size = utils::make_uvec3(running_range); utils::uvec3 local_size = adaptive_work_group_size(global_size); @@ -165,7 +164,7 @@ void add_repeat_node( graph, out, running_range, src_offset, dst_offset, out); } - running_range.data[0] = running_range.data[0] * width_repeat; + running_range[0] = running_range[0] * width_repeat; } // Height @@ -179,7 +178,7 @@ void add_repeat_node( graph, out, running_range, src_offset, dst_offset, out); } - running_range.data[1] = running_range.data[1] * height_repeat; + running_range[1] = running_range[1] * height_repeat; } // Batch @@ -187,13 +186,13 @@ void add_repeat_node( utils::ivec3 src_offset{0, 0, 0}; for (int i = 1; i < batch_repeat; ++i) { - utils::ivec3 dst_offset = {0, 0, i * running_range.data[2]}; + utils::ivec3 dst_offset = {0, 0, i * running_range[2]}; add_copy_offset_node( graph, out, running_range, src_offset, dst_offset, out); } - running_range.data[2] = running_range.data[2] * batch_repeat; + running_range[2] = running_range[2] * batch_repeat; } } diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp index 9e3ae2e6a7..e093ccf1b7 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Split.cpp @@ -53,7 +53,7 @@ void add_split_with_sizes_default_node( utils::ivec3 range = t_out->texture_limits(); add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); - src_offset.data[0] += range.data[0]; + src_offset[0] += range[0]; } } else if (dim_index == kHeight4D) { utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false); @@ -64,7 +64,7 @@ void add_split_with_sizes_default_node( utils::ivec3 range = t_out->texture_limits(); add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); - src_offset.data[1] += range.data[1]; + src_offset[1] += range[1]; } } else if (dim_index == kBatch4D) { utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false); @@ -75,7 +75,7 @@ void add_split_with_sizes_default_node( utils::ivec3 range = t_out->texture_limits(); add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); - src_offset.data[2] += range.data[2]; + src_offset[2] += range[2]; } } else if (dim_index == kChannel4D) { int32_t src_offset = 0; diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 79b463d7ef..b02613c208 100644 --- 
a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -24,12 +24,14 @@ void add_staging_to_tensor_node( vkapi::ShaderInfo shader = get_nchw_to_tensor_shader( *graph.get_tensor(out_tensor), graph.int8_buffers_enabled()); - vkapi::ParamsBindList ubos({graph.sizes_ubo(out_tensor)}); + vkapi::ParamsBindList ubos; if (graph.is_buffer_storage(out_tensor)) { - ubos.append({ - graph.texel_strides_ubo(out_tensor), - graph.ntexels_ubo(out_tensor), - }); + ubos.append( + {graph.sizes_ubo(out_tensor), + graph.strides_ubo(out_tensor), + graph.numel_ubo(out_tensor)}); + } else { + ubos.append(graph.sizes_ubo(out_tensor)); } graph.execute_nodes().emplace_back(new ExecuteNode( @@ -59,9 +61,18 @@ void add_tensor_to_staging_node( *graph.get_tensor(in_tensor), graph.int8_buffers_enabled()); utils::uvec3 global_wg_size = graph.create_global_wg_size(in_tensor); - vkapi::ParamsBindList ubos({graph.sizes_ubo(in_tensor)}); - // Normally, the tensor_to_nchw shader is structured so that each thread reads + vkapi::ParamsBindList ubos; + if (graph.is_buffer_storage(in_tensor)) { + ubos.append( + {graph.sizes_ubo(in_tensor), + graph.strides_ubo(in_tensor), + graph.numel_ubo(in_tensor)}); + } else { + ubos.append(graph.sizes_ubo(in_tensor)); + } + + // Normally, the image_to_nchw shader is structured so that each thread reads // one texel from the input texture and writes each component of the texel // into the corresponding location in the output buffer. However, this shader // is structured slightly differently in that each thread writes out a @@ -69,17 +80,10 @@ void add_tensor_to_staging_node( // output buffer. Therefore, the global work group size for this shader will // be the number of elements in the output buffer divided by 4, as opposed to // the extents of the input texture. 
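As a minimal sketch of the packing described in the comment above (illustrative only; the actual work happens inside the `int8_image_to_nchw_noint8` shader), each invocation writes one 32-bit word holding four int8 values, which is why the global work group size is the staging buffer's element count divided by 4:

```python
# Hypothetical sketch of the int8 packing; names are illustrative, not from the PR.
def pack_4x_int8(vals):
    # Pack four signed int8 values into one 32-bit word (little-endian byte order).
    assert len(vals) == 4
    packed = 0
    for i, v in enumerate(vals):
        packed |= (v & 0xFF) << (8 * i)  # two's-complement byte
    return packed

def global_wg_size_x(staging_numel):
    # One invocation per packed 32-bit word.
    return staging_numel // 4

assert pack_4x_int8([1, -1, 2, -2]) == 0xFE02FF01
assert global_wg_size_x(16) == 4
```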
- if (shader.kernel_name == "int8_tensor_to_nchw_noint8") { + if (shader.kernel_name == "int8_image_to_nchw_noint8") { uint32_t buffer_len = graph.get_staging(out_staging)->numel() / 4; global_wg_size = {buffer_len, 1, 1}; - ubos.append({graph.ntexels_ubo(in_tensor)}); - } - - if (graph.is_buffer_storage(in_tensor)) { - ubos.append({ - graph.texel_strides_ubo(in_tensor), - graph.ntexels_ubo(in_tensor), - }); + ubos.append({graph.numel_ubo(in_tensor)}); } graph.execute_nodes().emplace_back(new ExecuteNode( @@ -88,8 +92,8 @@ void add_tensor_to_staging_node( global_wg_size, graph.create_local_wg_size(global_wg_size), // Input and Outputs - {{in_tensor, vkapi::MemoryAccessType::READ}, - {out_staging, vkapi::MemoryAccessType::WRITE}}, + {{out_staging, vkapi::MemoryAccessType::WRITE}, + {in_tensor, vkapi::MemoryAccessType::READ}}, // Parameter Buffers ubos, // Specialization Constants @@ -105,12 +109,11 @@ ValueRef prepack( vkapi::ShaderInfo shader = get_nchw_to_tensor_shader( *graph.get_tensor(v), graph.int8_buffers_enabled()); - vkapi::ParamsBindList ubos({graph.sizes_ubo(v)}); + vkapi::ParamsBindList ubos; if (graph.is_buffer_storage(v)) { - ubos.append({ - graph.texel_strides_ubo(v), - graph.ntexels_ubo(v), - }); + ubos.append({graph.sizes_ubo(v), graph.strides_ubo(v), graph.numel_ubo(v)}); + } else { + ubos.append(graph.sizes_ubo(v)); } graph.prepack_nodes().emplace_back(new PrepackNode( diff --git a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp index 4342be7229..075c0bc923 100644 --- a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp @@ -44,7 +44,7 @@ void add_unary_op_node( vkapi::ParamsBindList ubos({}); if (graph.is_buffer_storage(out)) { - ubos.append({graph.ntexels_ubo(out)}); + ubos.append({graph.numel_ubo(out)}); } else { ubos.append({graph.texture_limits_ubo(out)}); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp b/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp index 29baff4bde..9183f2aea8 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp @@ -69,8 +69,8 @@ void add_upsample_nearest2d_node( utils::uvec3 input_sizes = t_in->image_extents(); utils::ivec2 input_size = { - utils::safe_downcast(input_sizes.data[0]), - utils::safe_downcast(input_sizes.data[1])}; + utils::safe_downcast(input_sizes[0]), + utils::safe_downcast(input_sizes[1])}; utils::vec2 rev_scales = { utils::safe_downcast(1.0), utils::safe_downcast(1.0)}; @@ -79,9 +79,9 @@ void add_upsample_nearest2d_node( auto output_size_ref = graph.get_int_list(output_sizes); rev_scales = { utils::safe_downcast( - (float)input_size.data[0] / output_size_ref->at(1)), + (float)input_size[0] / output_size_ref->at(1)), utils::safe_downcast( - (float)input_size.data[1] / output_size_ref->at(0))}; + (float)input_size[1] / output_size_ref->at(0))}; } else { auto scales = graph.get_double_list(scale_factors); diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp index c5cef52f7a..2fb0f60b24 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp @@ -85,18 +85,18 @@ std::vector calc_out_sizes_hw( // Height out_sizes.at(0) = calc_out_size( in_sizes.at(ndim - 2), - kernel_size.data[1], - stride.data[1], - padding.data[1], - dilation.data[1], + kernel_size[1], + stride[1], 
+ padding[1], + dilation[1], ceil_mode); // Width out_sizes.at(1) = calc_out_size( in_sizes.at(ndim - 1), - kernel_size.data[0], - stride.data[0], - padding.data[0], - dilation.data[0], + kernel_size[0], + stride[0], + padding[0], + dilation[0], ceil_mode); return out_sizes; @@ -128,19 +128,19 @@ std::vector calc_transpose_out_sizes_hw( // Height out_sizes.at(0) = calc_transpose_out_size( in_sizes.at(ndim - 2), - kernel_size.data[1], - stride.data[1], - padding.data[1], - dilation.data[1], - output_padding.data[1]); + kernel_size[1], + stride[1], + padding[1], + dilation[1], + output_padding[1]); // Width out_sizes.at(1) = calc_transpose_out_size( in_sizes.at(ndim - 1), - kernel_size.data[0], - stride.data[0], - padding.data[0], - dilation.data[0], - output_padding.data[0]); + kernel_size[0], + stride[0], + padding[0], + dilation[0], + output_padding[0]); return out_sizes; } diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp index 8d86c8287f..2737a86a1a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp @@ -101,15 +101,15 @@ utils::ivec2 create_broadcast_params( utils::uvec3 adaptive_work_group_size(const utils::uvec3& global_work_group) { utils::uvec3 local_group_size = {4, 4, 4}; - if (global_work_group.data[2u] == 1) { - if (global_work_group.data[1u] < 8) { - local_group_size.data[0u] = 16; - local_group_size.data[1u] = 4; - local_group_size.data[2u] = 1; + if (global_work_group[2u] == 1) { + if (global_work_group[1u] < 8) { + local_group_size[0u] = 16; + local_group_size[1u] = 4; + local_group_size[2u] = 1; } else { - local_group_size.data[0u] = 8; - local_group_size.data[1u] = 8; - local_group_size.data[2u] = 1; + local_group_size[0u] = 8; + local_group_size[1u] = 8; + local_group_size[2u] = 1; } } return local_group_size; diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index 2ade34e425..294e36b9a8 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -103,10 +103,16 @@ vkapi::ShaderInfo get_nchw_to_tensor_shader( if (v_dst.dtype() == vkapi::kChar && v_dst.storage_type() == utils::kTexture3D && !int8_buffer_enabled) { - return VK_KERNEL(nchw_to_int8_tensor_noint8); + return VK_KERNEL(nchw_to_int8_image_noint8); } - kernel_name = "nchw_to_tensor"; + if (v_dst.storage_type() == utils::kBuffer) { + kernel_name = "nchw_to_buffer"; + add_dtype_suffix(kernel_name, v_dst); + return VK_KERNEL_FROM_STR(kernel_name); + } + + kernel_name = "nchw_to_image"; add_dtype_suffix(kernel_name, v_dst); add_storage_type_suffix(kernel_name, v_dst); @@ -121,10 +127,16 @@ vkapi::ShaderInfo get_tensor_to_nchw_shader( if (v_src.dtype() == vkapi::kChar && v_src.storage_type() == utils::kTexture3D && !int8_buffer_enabled) { - return VK_KERNEL(int8_tensor_to_nchw_noint8); + return VK_KERNEL(int8_image_to_nchw_noint8); + } + + if (v_src.storage_type() == utils::kBuffer) { + kernel_name = "buffer_to_nchw"; + add_dtype_suffix(kernel_name, v_src); + return VK_KERNEL_FROM_STR(kernel_name); } - kernel_name = "tensor_to_nchw"; + kernel_name = "image_to_nchw"; add_dtype_suffix(kernel_name, v_src); add_storage_type_suffix(kernel_name, v_src); diff --git a/backends/vulkan/runtime/utils/VecUtils.h b/backends/vulkan/runtime/utils/VecUtils.h index bc0179e4a4..55bb0f7d1b 100644 --- 
a/backends/vulkan/runtime/utils/VecUtils.h +++ b/backends/vulkan/runtime/utils/VecUtils.h @@ -237,6 +237,16 @@ template struct vec final { // NOLINTNEXTLINE Type data[N]; + + const Type& operator[](const uint32_t& i) const { + VK_CHECK_COND(i >= 0 && i < N, "Index out of bounds!"); + return data[i]; + } + + Type& operator[](const uint32_t& i) { + VK_CHECK_COND(i >= 0 && i < N, "Index out of bounds!"); + return data[i]; + } }; } // namespace detail @@ -261,24 +271,22 @@ using vec4 = vec<4u>; // uvec3 is the type representing tensor extents. Useful for debugging. inline std::ostream& operator<<(std::ostream& os, const uvec3& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")"; + os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ")"; return os; } inline std::ostream& operator<<(std::ostream& os, const ivec3& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")"; + os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ")"; return os; } inline std::ostream& operator<<(std::ostream& os, const uvec4& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " - << v.data[3u] << ")"; + os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ", " << v[3u] << ")"; return os; } inline std::ostream& operator<<(std::ostream& os, const ivec4& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " - << v.data[3u] << ")"; + os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ", " << v[3u] << ")"; return os; } @@ -288,7 +296,7 @@ inline detail::vec divup_vec( const detail::vec& b) { detail::vec result; for (uint32_t i = 0; i < N; ++i) { - result.data[i] = utils::div_up(a.data[i], b.data[i]); + result[i] = utils::div_up(a[i], b[i]); } return result; } @@ -369,7 +377,7 @@ inline ivec4 make_ivec4_prepadded1(const std::vector& ints) { ivec4 result = {1, 1, 1, 1}; size_t base = 4 - ints.size(); for (size_t i = 0; i < ints.size(); ++i) { - result.data[i + base] = safe_downcast(ints[i]); + result[i + base] = safe_downcast(ints[i]); } return result; @@ -377,16 +385,16 @@ inline ivec4 make_ivec4_prepadded1(const std::vector& ints) { inline ivec3 make_ivec3(uvec3 ints) { return { - safe_downcast(ints.data[0u]), - safe_downcast(ints.data[1u]), - safe_downcast(ints.data[2u])}; + safe_downcast(ints[0u]), + safe_downcast(ints[1u]), + safe_downcast(ints[2u])}; } inline uvec3 make_uvec3(ivec3 ints) { return { - safe_downcast(ints.data[0u]), - safe_downcast(ints.data[1u]), - safe_downcast(ints.data[2u])}; + safe_downcast(ints[0u]), + safe_downcast(ints[1u]), + safe_downcast(ints[2u])}; } /* diff --git a/backends/vulkan/runtime/vk_api/Command.cpp b/backends/vulkan/runtime/vk_api/Command.cpp index 2803e3fc8d..713fd9917e 100644 --- a/backends/vulkan/runtime/vk_api/Command.cpp +++ b/backends/vulkan/runtime/vk_api/Command.cpp @@ -171,13 +171,10 @@ void CommandBuffer::dispatch(const utils::uvec3& global_workgroup_size) { vkCmdDispatch( handle_, + utils::div_up(global_workgroup_size[0u], bound_.local_workgroup_size[0u]), + utils::div_up(global_workgroup_size[1u], bound_.local_workgroup_size[1u]), utils::div_up( - global_workgroup_size.data[0u], bound_.local_workgroup_size.data[0u]), - utils::div_up( - global_workgroup_size.data[1u], bound_.local_workgroup_size.data[1u]), - utils::div_up( - global_workgroup_size.data[2u], - bound_.local_workgroup_size.data[2u])); + global_workgroup_size[2u], bound_.local_workgroup_size[2u])); state_ = CommandBuffer::State::RECORDING; } diff --git 
a/backends/vulkan/runtime/vk_api/Descriptor.cpp b/backends/vulkan/runtime/vk_api/Descriptor.cpp index 3d64fbf292..03b01c3fa8 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.cpp +++ b/backends/vulkan/runtime/vk_api/Descriptor.cpp @@ -38,6 +38,10 @@ ParamsBindList::ParamsBindList( std::copy(init_list.begin(), init_list.end(), bind_infos.begin()); } +void ParamsBindList::append(const BufferBindInfo& bind_info) { + bind_infos.emplace_back(bind_info); +} + void ParamsBindList::append(const ParamsBindList& other) { bind_infos.insert( bind_infos.end(), other.bind_infos.begin(), other.bind_infos.end()); diff --git a/backends/vulkan/runtime/vk_api/Descriptor.h b/backends/vulkan/runtime/vk_api/Descriptor.h index 28a89149d4..418d79a6b3 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.h +++ b/backends/vulkan/runtime/vk_api/Descriptor.h @@ -39,8 +39,10 @@ struct BufferBindInfo final { struct ParamsBindList final { std::vector bind_infos; + ParamsBindList() = default; ParamsBindList(std::initializer_list init_list); + void append(const BufferBindInfo& bind_info); void append(const ParamsBindList& other); }; diff --git a/backends/vulkan/runtime/vk_api/VkUtils.h b/backends/vulkan/runtime/vk_api/VkUtils.h index 2b9b48d9ff..b765d417d4 100644 --- a/backends/vulkan/runtime/vk_api/VkUtils.h +++ b/backends/vulkan/runtime/vk_api/VkUtils.h @@ -14,7 +14,7 @@ namespace vkcompute { namespace vkapi { inline VkExtent3D create_extent3d(const utils::uvec3& extents) { - return VkExtent3D{extents.data[0u], extents.data[1u], extents.data[2u]}; + return VkExtent3D{extents[0u], extents[1u], extents[2u]}; } } // namespace vkapi diff --git a/backends/vulkan/test/TARGETS b/backends/vulkan/test/TARGETS index 5ac8789276..9e1a64e88a 100644 --- a/backends/vulkan/test/TARGETS +++ b/backends/vulkan/test/TARGETS @@ -18,6 +18,7 @@ python_unittest( "//executorch/backends/vulkan:vulkan_preprocess", "//executorch/backends/vulkan/partitioner:vulkan_partitioner", "//executorch/exir:lib", + "//executorch/exir/program:program", "//executorch/extension/pybindings:portable_lib", # @manual "//executorch/extension/pytree:pylib", "//executorch/kernels/portable:custom_ops_generated_lib", diff --git a/backends/vulkan/test/glsl/idx_fill_buffer.glsl b/backends/vulkan/test/glsl/idx_fill_buffer.glsl index 98cf04e338..d32c52c205 100644 --- a/backends/vulkan/test/glsl/idx_fill_buffer.glsl +++ b/backends/vulkan/test/glsl/idx_fill_buffer.glsl @@ -10,39 +10,24 @@ #define PRECISION ${PRECISION} -#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} +#define T ${buffer_scalar_type(DTYPE)} #include "indexing_utils.h" -$if DTYPE == "half": - #extension GL_EXT_shader_16bit_storage : require - #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require -$elif DTYPE == "int8": - #extension GL_EXT_shader_8bit_storage : require - #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require -$elif DTYPE == "uint8": - #extension GL_EXT_shader_8bit_storage : require - #extension GL_EXT_shader_explicit_arithmetic_types_uint8 : require +${define_required_extensions(DTYPE)} layout(std430) buffer; -layout(set = 0, binding = 0) buffer PRECISION restrict writeonly Buffer { - VEC4_T data[]; -} -buffer_in; - -layout(set = 0, binding = 1) uniform PRECISION restrict Params { - int len; -} -params; +${layout_declare_buffer(0, "w", "out_buf", DTYPE, PRECISION, True)} +${layout_declare_ubo(1, "int", "numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { - const int i = ivec3(gl_GlobalInvocationID).x; - - const int base = 4 
* i; - if (base < params.len) { - buffer_in.data[i] = VEC4_T(base, base + 1, base + 2, base + 3); + const int t_id = ivec3(gl_GlobalInvocationID).x; + if (t_id >= numel) { + return; } + + out_buf[t_id] = T(t_id); } diff --git a/backends/vulkan/test/glsl/scalar_add_buffer.glsl b/backends/vulkan/test/glsl/scalar_add_buffer.glsl index 7f6cb2db47..cd3a85a165 100644 --- a/backends/vulkan/test/glsl/scalar_add_buffer.glsl +++ b/backends/vulkan/test/glsl/scalar_add_buffer.glsl @@ -10,22 +10,14 @@ #define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, "buffer")} - -$if DTYPE == "half": - #extension GL_EXT_shader_16bit_storage : require - #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require -$elif DTYPE == "int8": - #extension GL_EXT_shader_8bit_storage : require - #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require -$elif DTYPE == "uint8": - #extension GL_EXT_shader_8bit_storage : require - #extension GL_EXT_shader_explicit_arithmetic_types_uint8 : require +${define_required_extensions(DTYPE)} + +#define T ${buffer_scalar_type(DTYPE)} layout(std430) buffer; ${layout_declare_tensor(0, "rw", "buffer_in", DTYPE, "buffer")} -${layout_declare_ubo(1, "int", "ntexels")} +${layout_declare_ubo(1, "int", "numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -33,9 +25,9 @@ layout(constant_id = 3) const float scalar = 2.0; void main() { const int t_id = ivec3(gl_GlobalInvocationID).x; - if (t_id >= ntexels) { + if (t_id >= numel) { return; } - buffer_in[t_id] = buffer_in[t_id] + VEC4_T(scalar);// buffer_in[t_id]; + buffer_in[t_id] = buffer_in[t_id] + T(scalar); } diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 649c0c82d6..e6f2863470 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -23,15 +23,13 @@ void record_nchw_to_buffer_op( vkapi::VulkanBuffer& src_buffer, api::vTensor& v_dst) { vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = { - SV(v_dst.packed_dim_whcn_idx())}; context->submit_compute_job( get_nchw_to_tensor_shader(v_dst), pipeline_barrier, - {uint32_t(v_dst.texel_numel()), 1, 1}, + {uint32_t(v_dst.numel()), 1, 1}, {64, 1, 1}, - specialization_constants, + {}, VK_NULL_HANDLE, 0, v_dst.buffer( @@ -40,8 +38,8 @@ void record_nchw_to_buffer_op( vkapi::MemoryAccessType::WRITE), src_buffer, v_dst.sizes_ubo(), - v_dst.texel_strides_ubo(), - v_dst.ntexels_ubo()); + v_dst.strides_ubo(), + v_dst.numel_ubo()); } void record_buffer_to_nchw_op( @@ -49,22 +47,19 @@ void record_buffer_to_nchw_op( api::vTensor& v_src, vkapi::VulkanBuffer& dst_buffer) { vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = { - SV(v_src.packed_dim_whcn_idx())}; - context->submit_compute_job( get_tensor_to_nchw_shader(v_src), pipeline_barrier, - {uint32_t(v_src.texel_numel()), 1, 1}, + {uint32_t(v_src.numel()), 1, 1}, {64, 1, 1}, - specialization_constants, + {}, VK_NULL_HANDLE, 0, - v_src.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE), dst_buffer, + v_src.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE), v_src.sizes_ubo(), - v_src.texel_strides_ubo(), - v_src.ntexels_ubo()); + v_src.strides_ubo(), + v_src.numel_ubo()); } void record_nchw_to_image_op( @@ -108,8 +103,8 @@ void record_image_to_nchw_op( specialization_constants, VK_NULL_HANDLE, 0, - v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), dst_buffer, + v_src.image(pipeline_barrier, 
vkapi::PipelineStage::COMPUTE), v_src.sizes_ubo()); } @@ -121,17 +116,17 @@ void record_int8_image_to_nchw_noint8_op( uint32_t buffer_len = utils::safe_downcast(dst_buffer.numel() / 4); utils::uvec3 global_wg_size = {buffer_len, 1, 1}; context->submit_compute_job( - VK_KERNEL(int8_tensor_to_nchw_noint8), + VK_KERNEL(int8_image_to_nchw_noint8), pipeline_barrier, global_wg_size, adaptive_work_group_size(global_wg_size), {v_src.packed_dim_whcn_idx()}, VK_NULL_HANDLE, 0, - v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), dst_buffer.buffer(), + v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), v_src.sizes_ubo(), - v_src.ntexels_ubo()); + v_src.numel_ubo()); } void record_conv2d_prepack_weights_op( @@ -251,7 +246,7 @@ void record_index_fill_buffer(api::Context* context, api::vTensor& v_ten) { api::context()->submit_compute_job( VK_KERNEL_FROM_STR(kernel_name), pipeline_barrier, - {uint32_t(v_ten.texel_numel()), 1, 1}, + {uint32_t(v_ten.numel()), 1, 1}, {64, 1, 1}, specialization_constants, VK_NULL_HANDLE, @@ -275,7 +270,7 @@ void record_scalar_add_buffer( api::context()->submit_compute_job( VK_KERNEL_FROM_STR(kernel), pipeline_barrier, - {uint32_t(v_ten.texel_numel()), 1, 1}, + {uint32_t(v_ten.numel()), 1, 1}, {64, 1, 1}, specialization_constants, VK_NULL_HANDLE, @@ -284,7 +279,7 @@ void record_scalar_add_buffer( pipeline_barrier, vkapi::PipelineStage::COMPUTE, vkapi::MemoryAccessType::READ | vkapi::MemoryAccessType::WRITE), - v_ten.ntexels_ubo()); + v_ten.numel_ubo()); } // @@ -302,15 +297,15 @@ void record_scalar_add_buffer( void fill_vtensor(api::vTensor& vten, std::vector& data) { api::StorageBuffer staging_buffer(api::context(), vten.dtype(), data.size()); -#define CASE(ctype, name) \ - case vkapi::ScalarType::name: { \ - std::vector data_converted; \ - data_converted.resize(data.size()); \ - for (int i = 0; i < data.size(); ++i) { \ - data_converted[i] = ctype(data[i]); \ - } \ - copy_ptr_to_staging( \ - data_converted.data(), staging_buffer, vten.gpu_nbytes()); \ +#define CASE(ctype, name) \ + case vkapi::ScalarType::name: { \ + std::vector data_converted; \ + data_converted.resize(data.size()); \ + for (int i = 0; i < data.size(); ++i) { \ + data_converted[i] = ctype(data[i]); \ + } \ + copy_ptr_to_staging( \ + data_converted.data(), staging_buffer, vten.staging_buffer_nbytes()); \ } break; switch (vten.dtype()) { @@ -329,7 +324,7 @@ void fill_vtensor(api::vTensor& vten, std::vector& data) { } void fill_vtensor(api::vTensor& vten, float val, bool iota) { - std::vector vten_data(vten.gpu_numel()); + std::vector vten_data(vten.staging_buffer_numel()); if (iota) { std::iota(vten_data.begin(), vten_data.end(), val); } else { @@ -344,7 +339,11 @@ void fill_vtensor( const IOValueRef idx, float val, bool iota) { - std::vector data(graph.get_tensor(idx.value)->gpu_numel()); + vTensorPtr t = graph.get_tensor(idx.value); + std::vector data(t->numel()); + if (t->storage_type() != utils::kBuffer) { + data.resize(t->staging_buffer_numel()); + } if (iota) { std::iota(data.begin(), data.end(), val); } else { @@ -356,7 +355,7 @@ void fill_vtensor( void extract_vtensor(api::vTensor& vten, std::vector& data) { api::StorageBuffer staging_buffer( - api::context(), vten.dtype(), vten.gpu_numel()); + api::context(), vten.dtype(), vten.staging_buffer_numel()); if (vten.storage_type() == utils::StorageType::BUFFER) { record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer()); @@ -368,14 +367,14 @@ void extract_vtensor(api::vTensor& vten, std::vector& data) { 
api::context()->submit_cmd_to_gpu(fence.get_submit_handle()); fence.wait(); -#define CASE(ctype, name) \ - case vkapi::ScalarType::name: { \ - std::vector data_converted(data.size()); \ - copy_staging_to_ptr( \ - staging_buffer, data_converted.data(), vten.gpu_nbytes()); \ - for (int i = 0; i < data.size(); ++i) { \ - data[i] = float(data_converted[i]); \ - } \ +#define CASE(ctype, name) \ + case vkapi::ScalarType::name: { \ + std::vector data_converted(data.size()); \ + copy_staging_to_ptr( \ + staging_buffer, data_converted.data(), vten.staging_buffer_nbytes()); \ + for (int i = 0; i < data.size(); ++i) { \ + data[i] = float(data_converted[i]); \ + } \ } break; switch (vten.dtype()) { @@ -431,7 +430,7 @@ void execute_graph_and_check_output( IOValueRef out_ioval = graph.outputs().at(i); vTensorPtr t_out = graph.get_tensor(out_ioval.value); - std::vector output_data(t_out->gpu_numel()); + std::vector output_data(t_out->staging_buffer_numel()); graph.copy_from_staging( out_ioval.staging, output_data.data(), output_data.size()); diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index 3dd9497e69..8b8a8d2373 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -36,16 +36,16 @@ using namespace vkcompute; utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, \ allocate_memory); -#define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ - api::StorageBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.gpu_numel()); \ - record_nchw_to_image_op( \ +#define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ + api::StorageBuffer staging_buffer_##tensor( \ + api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ + record_nchw_to_image_op( \ api::context(), staging_buffer_##tensor.buffer(), tensor); -#define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ - api::StorageBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.gpu_numel()); \ - record_image_to_nchw_op( \ +#define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ + api::StorageBuffer staging_buffer_##tensor( \ + api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ + record_image_to_nchw_op( \ api::context(), tensor, staging_buffer_##tensor.buffer()); #define CHECK_VALUE(data, idx, expected) \ @@ -142,7 +142,7 @@ void fill_vtensor( void extract_vtensor(api::vTensor& vten, std::vector& data); inline std::vector extract_vtensor(api::vTensor& vten) { - std::vector data_out(vten.gpu_numel()); + std::vector data_out(vten.staging_buffer_numel()); extract_vtensor(vten, data_out); return data_out; } diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 6f0879c422..f1b24cc05c 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -8,7 +8,6 @@ #include -#include #include #include @@ -70,22 +69,27 @@ TEST_F(VulkanComputeAPITest, print_adapter) { std::vector get_reference_strides( const std::vector& sizes, const utils::GPUMemoryLayout layout, - const bool texel_strides) { + const bool unsqueezed = false) { int64_t C = utils::val_at(-3, sizes); int64_t H = utils::val_at(-2, sizes); int64_t W = utils::val_at(-1, sizes); + int64_t numel = utils::multiply_integers(sizes); + switch (layout) { case utils::kWidthPacked: - if (texel_strides) { - W = utils::div_up(W, INT64_C(4)); - } switch (sizes.size()) { case 1: + if (unsqueezed) + return {numel, numel, numel, 1}; 
return {1}; case 2: + if (unsqueezed) + return {numel, numel, W, 1}; return {W, 1}; case 3: + if (unsqueezed) + return {numel, H * W, W, 1}; return {H * W, W, 1}; case 4: return {C * H * W, H * W, W, 1}; @@ -94,15 +98,18 @@ std::vector get_reference_strides( } break; case utils::kHeightPacked: - if (texel_strides) { - H = utils::div_up(H, INT64_C(4)); - } switch (sizes.size()) { case 1: + if (unsqueezed) + return {numel, numel, numel, 1}; return {1}; case 2: + if (unsqueezed) + return {numel, numel, 1, H}; return {1, H}; case 3: + if (unsqueezed) + return {numel, H * W, 1, H}; return {W * H, 1, H}; case 4: return {C * W * H, W * H, 1, H}; @@ -110,15 +117,18 @@ std::vector get_reference_strides( return {}; } case utils::kChannelsPacked: - if (texel_strides) { - C = utils::div_up(C, INT64_C(4)); - } switch (sizes.size()) { case 1: + if (unsqueezed) + return {numel, numel, numel, 1}; return {1}; case 2: + if (unsqueezed) + return {numel, numel, W, 1}; return {W, 1}; case 3: + if (unsqueezed) + return {numel, 1, W * C, C}; return {1, W * C, C}; case 4: return {H * W * C, 1, W * C, C}; @@ -136,26 +146,45 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { } for (const auto& layout : {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) { - // texel_strides = true { std::vector strides = calculate_strides(sizes, layout); - std::vector ref_strides = - get_reference_strides(sizes, layout, true); - + std::vector ref_strides = get_reference_strides(sizes, layout); ASSERT_TRUE(strides == ref_strides); - } - // texel_strides = false - { - std::vector strides = calculate_strides(sizes, layout, false); - std::vector ref_strides = - get_reference_strides(sizes, layout, false); - ASSERT_TRUE(strides == ref_strides); + int64_t numel = utils::multiply_integers(sizes); + std::vector unsqueezed_strides = + unsqueeze_strides(strides, numel); + std::vector ref_unsqueezed_strides = + get_reference_strides(sizes, layout, true); + + ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides); } } } } +TEST_F(VulkanComputeAPITest, vec_test) { + utils::vec3 v3({1, 2, 3}); + ASSERT_TRUE(v3[0] == 1); + ASSERT_TRUE(v3[1] == 2); + ASSERT_TRUE(v3[2] == 3); + v3 = {4, 5, 6}; + ASSERT_TRUE(v3[0] == 4); + ASSERT_TRUE(v3[1] == 5); + ASSERT_TRUE(v3[2] == 6); + + utils::uvec4 uv4({4, 3, 2, 1}); + ASSERT_TRUE(uv4[0] == 4); + ASSERT_TRUE(uv4[1] == 3); + ASSERT_TRUE(uv4[2] == 2); + ASSERT_TRUE(uv4[3] == 1); + uv4 = {11, 13, 12, 88}; + ASSERT_TRUE(uv4[0] == 11); + ASSERT_TRUE(uv4[1] == 13); + ASSERT_TRUE(uv4[2] == 12); + ASSERT_TRUE(uv4[3] == 88); +} + TEST_F(VulkanComputeAPITest, retrieve_custom_shader_test) { // Try to get shader from custom shader library const vkapi::ShaderInfo& kernel = VK_KERNEL(test_shader); @@ -283,7 +312,8 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { params.buffer()); } - StorageBuffer staging_buffer(context(), vkapi::kFloat, a.gpu_numel()); + StorageBuffer staging_buffer( + context(), vkapi::kFloat, a.staging_buffer_numel()); record_image_to_nchw_op(context(), a, staging_buffer.buffer()); submit_to_gpu(); @@ -491,9 +521,9 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) { // No allocations made so far EXPECT_TRUE(get_vma_allocation_count() == 0); - std::vector data_a(a.gpu_numel()); + std::vector data_a(a.staging_buffer_numel()); std::fill(data_a.begin(), data_a.end(), 2.5f); - std::vector data_b(b.gpu_numel()); + std::vector data_b(b.staging_buffer_numel()); std::fill(data_b.begin(), data_b.end(), 1.5f); // Allocate memory at the last 
possible opportunity @@ -512,7 +542,7 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) { record_binary_op(context(), "add", a, b, c); - std::vector data_c(c.gpu_numel()); + std::vector data_c(c.staging_buffer_numel()); extract_vtensor(c, data_c); for (size_t i = 0; i < data_c.size(); ++i) { @@ -552,11 +582,11 @@ TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) { EXPECT_TRUE(get_vma_allocation_count() == 3); // Specify input data - std::vector data_a(a.gpu_numel()); + std::vector data_a(a.staging_buffer_numel()); std::fill(data_a.begin(), data_a.end(), 2.5f); - std::vector data_b(b.gpu_numel()); + std::vector data_b(b.staging_buffer_numel()); std::fill(data_b.begin(), data_b.end(), 1.5f); - std::vector data_d(b.gpu_numel()); + std::vector data_d(b.staging_buffer_numel()); std::fill(data_d.begin(), data_d.end(), 1.0f); // First, fill a and b with data @@ -573,7 +603,7 @@ TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) { record_binary_op(context(), "add", c, d, e); // Extract data from e - std::vector data_e(e.gpu_numel()); + std::vector data_e(e.staging_buffer_numel()); extract_vtensor(e, data_e); // Sanity check that the values are correct @@ -626,7 +656,7 @@ TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) { // No allocations yet EXPECT_TRUE(get_vma_allocation_count() == 0); - std::vector data_a(a.gpu_numel()); + std::vector data_a(a.staging_buffer_numel()); std::fill(data_a.begin(), data_a.end(), 2.5f); // Encoding a command buffer with a vTensor without memory should throw @@ -719,14 +749,18 @@ TEST_F(VulkanComputeAPITest, texture_virtual_resize) { b.virtual_resize(new_sizes); c.virtual_resize(new_sizes); - fill_staging(staging_buffer_a, float(new_sizes[1] + 1.5f), a.gpu_numel()); - fill_staging(staging_buffer_b, float(new_sizes[2] + 55.0f), b.gpu_numel()); + fill_staging( + staging_buffer_a, float(new_sizes[1] + 1.5f), a.staging_buffer_numel()); + fill_staging( + staging_buffer_b, + float(new_sizes[2] + 55.0f), + b.staging_buffer_numel()); submit_to_gpu(); check_staging_buffer( staging_buffer_c, float(new_sizes[1] + new_sizes[2] + 56.5f), - c.gpu_numel()); + c.staging_buffer_numel()); } } @@ -734,8 +768,9 @@ TEST_F(VulkanComputeAPITest, texture_virtual_resize) { // Compute Graph Tests // -#define EXTRACT_TENSOR(name) \ - std::vector data_##name(graph.get_tensor(name.value)->gpu_numel()); \ +#define EXTRACT_TENSOR(name) \ + std::vector data_##name( \ + graph.get_tensor(name.value)->staging_buffer_numel()); \ graph.copy_from_staging(name.staging, data_##name.data(), data_##name.size()); TEST(VulkanComputeGraphTest, test_values_scalars) { @@ -1717,7 +1752,7 @@ void run_from_gpu_test( vten.sizes_ubo()); } - StorageBuffer staging_buffer(context(), dtype, vten.gpu_numel()); + StorageBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); if (dtype == vkapi::kChar && !context()->adapter_ptr()->has_full_int8_buffers_support()) { @@ -1750,16 +1785,19 @@ void round_trip_test( vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout); // Create and fill input staging buffer - StorageBuffer staging_buffer_in(context(), dtype, vten.gpu_numel()); + StorageBuffer staging_buffer_in( + context(), dtype, vten.staging_buffer_numel()); std::vector data_in(staging_buffer_in.numel()); for (int i = 0; i < staging_buffer_in.numel(); i++) { data_in[i] = T(i * -1); } - copy_ptr_to_staging(data_in.data(), staging_buffer_in, vten.gpu_nbytes()); + copy_ptr_to_staging( + data_in.data(), staging_buffer_in, 
vten.staging_buffer_nbytes()); // Output staging buffer - StorageBuffer staging_buffer_out(context(), dtype, vten.gpu_numel()); + StorageBuffer staging_buffer_out( + context(), dtype, vten.staging_buffer_numel()); record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten); @@ -1817,7 +1855,7 @@ void compute_graph_round_trip_test( graph.execute(); - std::vector data_out(tensor->gpu_numel()); + std::vector data_out(tensor->staging_buffer_numel()); graph.copy_from_staging(r_staging_out, data_out.data(), data_out.size()); for (int i = 0; i < data_in.size(); i++) { @@ -2147,18 +2185,18 @@ void test_max_pool2d( fill_vtensor(graph, graph.inputs().at(0), base_val, /*iota = */ true); vTensorPtr t_in = graph.get_tensor(in_ioval.value); - std::vector input_data(t_in->gpu_numel()); + std::vector input_data(t_in->staging_buffer_numel()); graph.copy_from_staging( in_ioval.staging, input_data.data(), input_data.size()); graph.execute(); vTensorPtr t_out = graph.get_tensor(out_ioval.value); - std::vector output_data(t_out->gpu_numel()); + std::vector output_data(t_out->staging_buffer_numel()); graph.copy_from_staging( out_ioval.staging, output_data.data(), output_data.size()); vTensorPtr t_idx = graph.get_tensor(idx_ioval.value); - std::vector index_data(t_idx->gpu_numel()); + std::vector index_data(t_idx->staging_buffer_numel()); graph.copy_from_staging( idx_ioval.staging, index_data.data(), index_data.size()); @@ -2296,7 +2334,7 @@ void test_grid_priors( // run graph graph.execute(); - std::vector output_data(t_out->gpu_numel()); + std::vector output_data(t_out->staging_buffer_numel()); graph.copy_from_staging(out.staging, output_data.data(), output_data.size()); // check results diff --git a/backends/xnnpack/operators/__init__.py b/backends/xnnpack/operators/__init__.py index d25cc58d5a..b2653a5fdc 100644 --- a/backends/xnnpack/operators/__init__.py +++ b/backends/xnnpack/operators/__init__.py @@ -10,6 +10,7 @@ op_add, op_addmm, op_avg_pooling2d, + op_bmm, op_cat, op_ceiling, op_clamp, diff --git a/backends/xnnpack/operators/op_bmm.py b/backends/xnnpack/operators/op_bmm.py new file mode 100644 index 0000000000..8c008a5554 --- /dev/null +++ b/backends/xnnpack/operators/op_bmm.py @@ -0,0 +1,54 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
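Before the visitor definition that follows, a minimal sketch of the batch matrix multiply semantics being serialized may help: `torch.bmm` takes inputs shaped `(B, M, K)` and `(B, K, N)` and produces `(B, M, N)`, with no transposing or packing of either input.

```python
# Minimal reference for torch.bmm semantics, the op the visitor below serializes
# as an XNNBatchMatrixMultiply node.
import torch

x = torch.randn(2, 3, 4)  # (B, M, K)
y = torch.randn(2, 4, 6)  # (B, K, N)
out = torch.bmm(x, y)     # (B, M, N) == (2, 3, 6)

# Per-batch equivalent of the same computation.
ref = torch.stack([x[b] @ y[b] for b in range(x.shape[0])])
assert torch.allclose(out, ref)
```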
+ +from typing import Dict + +import torch +from executorch.backends.xnnpack.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( + XNNBatchMatrixMultiply, + XNNGraph, + XNode, +) +from executorch.backends.xnnpack.utils.utils import get_input_node + + +@register_node_visitor +class BMMVisitor(NodeVisitor): + target = "aten.bmm.default" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + xnn_graph: XNNGraph, + vals_to_ids: Dict[torch.fx.Node, int], + debug_handle: int, + ) -> None: + + self.define_nodes_tensor_inputs_outputs(node, xnn_graph, vals_to_ids) + + # input1 + input1_id = vals_to_ids[get_input_node(node, 0)] + + # input2 + input2_id = vals_to_ids[get_input_node(node, 1)] + + # output + output_id = vals_to_ids[node] + + ser_node = XNode( + xnode_union=XNNBatchMatrixMultiply( + input1_id=input1_id, input2_id=input2_id, output_id=output_id, flags=0 + ), + debug_handle=debug_handle, + ) + xnn_graph.xnodes.append(ser_node) diff --git a/backends/xnnpack/partition/config/__init__.py b/backends/xnnpack/partition/config/__init__.py index ed719dc40f..ab1eca1739 100644 --- a/backends/xnnpack/partition/config/__init__.py +++ b/backends/xnnpack/partition/config/__init__.py @@ -17,6 +17,7 @@ AbsConfig, AddConfig, AvgPoolingConfig, + BMMConfig, CatConfig, CeilConfig, ClampConfig, @@ -60,6 +61,7 @@ AddmmConfig, AvgPoolingConfig, BatchNormConfig, + BMMConfig, CatConfig, CeilConfig, ConstantPadConfig, diff --git a/backends/xnnpack/partition/config/generic_node_configs.py b/backends/xnnpack/partition/config/generic_node_configs.py index f0eac36696..8a87fe67f1 100644 --- a/backends/xnnpack/partition/config/generic_node_configs.py +++ b/backends/xnnpack/partition/config/generic_node_configs.py @@ -403,3 +403,15 @@ class SubConfig(GenericNodePartitionerConfig): def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] + + +class BMMConfig(GenericNodePartitionerConfig): + """ + Despite being a GEMM kernel, BMM can be partitioned like a single node partitioner + because it does not perform any packing on the inputs being matrix multiplied + """ + + target_name = "bmm.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index 8c8db60065..314e38aad3 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -1504,6 +1504,35 @@ Error defineScaledDotProductAttentionNode( return Error::Ok; } + +/* +Defines batch matrix multiply node into the subgraph, +using the remapped ids to map the serialized ids, +to the new ids generated when defining the tensor value +*/ +Error defineBatchMatrixMultiplyNode( + xnn_subgraph_t subgraph_ptr, + const std::unordered_map& remapped_ids, + const NodePtr node) noexcept { + auto graph_node = node->xnode_union_as_XNNBatchMatrixMultiply(); + + xnn_status status = xnn_define_batch_matrix_multiply( + subgraph_ptr, + remapped_ids.at(graph_node->input1_id()), + remapped_ids.at(graph_node->input2_id()), + remapped_ids.at(graph_node->output_id()), + graph_node->flags()); + + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to create BMM node %i with code: %s", + node->debug_handle(), + xnn_status_to_string(status)); +
+ return Error::Ok; +} + /* Returns not Implemented Error code. This function is meant to be called when the compiler encountes a XNodeType from the flatbuffer @@ -1566,6 +1595,7 @@ DefineNodeFunc getDefineNodeFunc(fb_xnnpack::XNodeUnion nodeType) { _DEFINE(Concatenate4) _DEFINE(StaticSlice) _DEFINE(ScaledDotProductAttention) + _DEFINE(BatchMatrixMultiply) case fb_xnnpack::XNodeUnion::NONE: default: // Adding here as a catch all, just in case return &defineNotImplementedNode; diff --git a/backends/xnnpack/serialization/runtime_schema.fbs b/backends/xnnpack/serialization/runtime_schema.fbs index 5ace211149..f32e7c6063 100644 --- a/backends/xnnpack/serialization/runtime_schema.fbs +++ b/backends/xnnpack/serialization/runtime_schema.fbs @@ -134,6 +134,7 @@ union XNodeUnion { XNNConcatenate4: _XNNCat, XNNStaticSlice, XNNScaledDotProductAttention, + XNNBatchMatrixMultiply: _XNNNode2x1, } union XValueUnion { diff --git a/backends/xnnpack/serialization/schema.fbs b/backends/xnnpack/serialization/schema.fbs index b968c6d9e9..773a459bbf 100644 --- a/backends/xnnpack/serialization/schema.fbs +++ b/backends/xnnpack/serialization/schema.fbs @@ -130,6 +130,7 @@ union XNodeUnion { XNNConcatenate4: _XNNCat, XNNStaticSlice, XNNScaledDotProductAttention, + XNNBatchMatrixMultiply: _XNNNode2x1, } union XValueUnion { diff --git a/backends/xnnpack/serialization/xnnpack_graph_schema.py b/backends/xnnpack/serialization/xnnpack_graph_schema.py index 9127474c91..e3e699c58f 100644 --- a/backends/xnnpack/serialization/xnnpack_graph_schema.py +++ b/backends/xnnpack/serialization/xnnpack_graph_schema.py @@ -177,6 +177,11 @@ class XNNConcatenate4(XNNCat): pass +@dataclass +class XNNBatchMatrixMultiply(XNNNode2x1): + pass + + @dataclass class XNNStaticTranspose: num_dims: int @@ -354,6 +359,7 @@ class XNNScaledDotProductAttention: XNNConcatenate4, XNNStaticSlice, XNNScaledDotProductAttention, + XNNBatchMatrixMultiply, ] diff --git a/backends/xnnpack/test/ops/bmm.py b/backends/xnnpack/test/ops/bmm.py new file mode 100644 index 0000000000..1c6235e5f7 --- /dev/null +++ b/backends/xnnpack/test/ops/bmm.py @@ -0,0 +1,46 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.backends.xnnpack.test.tester import Tester + + +class TestBMM(unittest.TestCase): + class BMM(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + return torch.bmm(x, y) + + def _test_bmm(self, inputs): + ( + Tester(self.BMM(), inputs) + .export() + .check_count({"torch.ops.aten.bmm.default": 1}) + .to_edge_transform_and_lower() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .check_not(["executorch_exir_dialects_edge__ops_aten_bmm_default"]) + .to_executorch() + .serialize() + .run_method_and_compare_outputs() + ) + + def test_fp16_bmm(self): + inputs = ( + torch.randn(2, 3, 4).to(torch.float16), + torch.randn(2, 4, 6).to(torch.float16), + ) + self._test_bmm(inputs) + + def test_fp32_bmm(self): + inputs = ( + torch.randn(2, 3, 4), + torch.randn(2, 4, 6), + ) + self._test_bmm(inputs) diff --git a/build/build_apple_frameworks.sh b/build/build_apple_frameworks.sh index 73635c3f90..a22fd4ecb9 100755 --- a/build/build_apple_frameworks.sh +++ b/build/build_apple_frameworks.sh @@ -76,7 +76,7 @@ usage() { echo echo "Options:" echo " --output=DIR Output directory. 
Default: 'cmake-out'" - echo " --Debug Use Debug build mode. Default: 'Release'" + echo " --Debug Use Debug build mode. Default: Uses Release build mode." echo " --toolchain=FILE Cmake toolchain file. Default: '\$SOURCE_ROOT_DIR/third-party/ios-cmake/ios.toolchain.cmake'" echo " --buck2=FILE Buck2 executable path. Default: Path of buck2 found in the current \$PATH" echo " --python=FILE Python executable path. Default: Path of python3 found in the current \$PATH" @@ -90,7 +90,7 @@ usage() { echo " --xnnpack Include this flag to build the XNNPACK backend." echo echo "Example:" - echo " $0 /path/to/source/root --output=cmake-out --Release --toolchain=/path/to/cmake/toolchain --buck2=/path/to/buck2 --python=/path/to/python3 --coreml --mps --xnnpack" + echo " $0 /path/to/source/root --output=cmake-out --toolchain=/path/to/cmake/toolchain --buck2=/path/to/buck2 --python=/path/to/python3 --coreml --mps --xnnpack" exit 0 } diff --git a/docs/source/build-run-coreml.md b/docs/source/build-run-coreml.md index 39794ac06c..52755773ee 100644 --- a/docs/source/build-run-coreml.md +++ b/docs/source/build-run-coreml.md @@ -127,7 +127,7 @@ python examples/apple/coreml/scripts/inspector_cli.py --etdump_path etdump.etdp 1. Build frameworks, running the following will create a `executorch.xcframework` and `coreml_backend.xcframework` in the `cmake-out` directory. ```bash cd executorch -./build/build_apple_frameworks.sh --Release --coreml +./build/build_apple_frameworks.sh --coreml ``` 2. Create a new [Xcode project](https://developer.apple.com/documentation/xcode/creating-an-xcode-project-for-an-app#) or open an existing project. diff --git a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md index ff5cb51595..c774ae57b4 100644 --- a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md @@ -5,6 +5,7 @@ build ExecuTorch for Qualcomm AI Engine Direct and running a model on it. Qualcomm AI Engine Direct is also referred to as QNN in the source and documentation. + ::::{grid} 2 :::{grid-item-card} What you will learn in this tutorial: @@ -35,11 +36,10 @@ Currently, this ExecuTorch Backend can delegate AI computations to Hexagon proce ### Host OS -The Linux host operating system that QNN Backend is verified with is Ubuntu 20.04 LTS x64. - -However, because Qualcomm Package Manager(QPM) used to download necessary SDK (see below) -only support Ubuntu, we recommend users to exercise this tutorial exacly -on Ubuntu 20.04. +At the moment of updating this tutorial, the QNN backend is verified with Ubuntu 22.04 LTS x64 +as the Linux host operating system. +We usually verify the backend on the same OS version that QNN itself is verified with; +that version is documented in the QNN SDK. ### Hardware: You will need an Android smartphone with adb-connected running on one of below Qualcomm SoCs: @@ -53,20 +53,18 @@ This example is verified with SM8550 and SM8450. ### Software: - Follow ExecuTorch recommended Python version. - - A compiler to compile AOT parts. GCC 9.4 come with Ubuntu20.04 is verified. - - [Android NDK](https://developer.android.com/ndk). This example is verified with NDK 25c. + - A compiler to compile AOT parts, e.g., the GCC compiler that comes with Ubuntu LTS. + - [Android NDK](https://developer.android.com/ndk). This example is verified with NDK 26c.
- [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) - - Follow the download button. After logging in, search Qualcomm AI Stack at the *Tool* panel. - - You can find Qualcomm AI Engine Direct SDK under the AI Stack group. - - Please download the Linux version, and follow instructions on the page to extract the file. - - The SDK should be installed to somewhere `/opt/qcom/aistack/qnn` by default. - - It's also OK to place it somewhere else. We don't have assumption about the absolute path of the SDK. - - This example is verified with version 2.12.0. + - Click the "Get Software" button to download a version of the QNN SDK. + - However, at the moment of updating this tutorial, the above website does not provide a QNN SDK newer than 2.22.6. + - Below are public links to download various QNN versions. We hope they will become publicly discoverable soon. + - [QNN 2.25.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip) + - [QNN 2.24.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.24.0.240626.zip) + - [QNN 2.23.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.23.0.24.06.24.zip) The directory with installed Qualcomm AI Engine Direct SDK looks like: ``` -$ tree -L 1 /opt/qcom/aistack/qnn// -/opt/qcom/aistack/qnn// ├── benchmarks ├── bin ├── docs @@ -74,11 +72,15 @@ $ tree -L 1 /opt/qcom/aistack/qnn// ├── include ├── lib ├── LICENSE.pdf +├── NOTICE.txt +├── NOTICE_WINDOWS.txt ├── QNN_NOTICE.txt ├── QNN_README.txt ├── QNN_ReleaseNotes.txt -├── share -└── Uninstall +├── ReleaseNotes.txt +├── ReleaseNotesWindows.txt +├── sdk.yaml +└── share ``` @@ -89,7 +91,7 @@ $ tree -L 1 /opt/qcom/aistack/qnn// `$QNN_SDK_ROOT` refers to the root of Qualcomm AI Engine Direct SDK, i.e., the directory containing `QNN_README.txt`. -`$ANDROID_NDK` refers to the root of Android NDK. +`$ANDROID_NDK_ROOT` refers to the root of Android NDK. `$EXECUTORCH_ROOT` refers to the root of executorch git repository. @@ -107,7 +109,16 @@ export PYTHONPATH=$EXECUTORCH_ROOT/.. ## Build -An example script for below building instructions is [here](https://github.com/pytorch/executorch/blob/main/backends/qualcomm/scripts/build.sh). +An example script for the below building instructions is [here](https://github.com/pytorch/executorch/blob/main/backends/qualcomm/scripts/build.sh). +We recommend using the script because the ExecuTorch build commands can change from time to time. +The above script is actively used and is updated more frequently than this tutorial. +An example usage is: +```bash +cd $EXECUTORCH_ROOT +./backends/qualcomm/scripts/build.sh +# or +./backends/qualcomm/scripts/build.sh --release +``` ### AOT (Ahead-of-time) components: @@ -115,14 +126,23 @@ Python APIs on x64 are required to compile models to Qualcomm AI Engine Direct b ```bash cd $EXECUTORCH_ROOT -# Workaround for fbs files in exir/_serialize -cp schema/program.fbs exir/_serialize/program.fbs -cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs - -mkdir build_x86_64 -cd build_x86_64 -cmake .. -DEXECUTORCH_BUILD_QNN=ON -DQNN_SDK_ROOT=${QNN_SDK_ROOT} -cmake --build . -t "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j8 +mkdir cmake-out +cd cmake-out +# Note that the command below might change. +# Please refer to the above build.sh for the latest working commands. +cmake ..
\ + -DCMAKE_INSTALL_PREFIX=$PWD \ + -DEXECUTORCH_BUILD_QNN=ON \ + -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ + -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF + +# nproc is used to detect the number of available CPU. +# If it is not applicable, please feel free to use the number you want. +cmake --build $PWD --target "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j$(nproc) # install Python APIs to correct import path # The filename might vary depending on your Python and host version. @@ -138,49 +158,58 @@ Commands to build `qnn_executor_runner` for Android: ```bash cd $EXECUTORCH_ROOT -mkdir build_android -cd build_android +mkdir cmake-out-android +cd cmake-out-android # build executorch & qnn_executorch_backend cmake .. \ -DCMAKE_INSTALL_PREFIX=$PWD \ - -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_QNN=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DPYTHON_EXECUTABLE=python3 \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ -DANDROID_ABI='arm64-v8a' \ - -DANDROID_NATIVE_API_LEVEL=23 \ - -B$PWD + -DANDROID_NATIVE_API_LEVEL=23 -cmake --build $PWD -j16 --target install +# nproc is used to detect the number of available CPU. +# If it is not applicable, please feel free to use the number you want. +cmake --build $PWD --target install -j$(nproc) cmake ../examples/qualcomm \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ -DANDROID_ABI='arm64-v8a' \ -DANDROID_NATIVE_API_LEVEL=23 \ -DCMAKE_PREFIX_PATH="$PWD/lib/cmake/ExecuTorch;$PWD/third-party/gflags;" \ -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ + -DPYTHON_EXECUTABLE=python3 \ -Bexamples/qualcomm -cmake --build examples/qualcomm -j16 +cmake --build examples/qualcomm -j$(nproc) + +# qnn_executor_runner can be found under examples/qualcomm +# The full path is $EXECUTORCH_ROOT/cmake-out-android/examples/qualcomm/qnn_executor_runner +ls examples/qualcomm ``` **Note:** If you want to build for release, add `-DCMAKE_BUILD_TYPE=Release` to the `cmake` command options. -You can find `qnn_executor_runner` under `build_android/examples/qualcomm/`. - -The build script is also available [here](https://github.com/pytorch/executorch/blob/main/backends/qualcomm/scripts/build.sh). ## Deploying and running on device ### AOT compile a model -You can refer to [this script](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/scripts/deeplab_v3.py) for the exact flow. +Refer to [this script](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/scripts/deeplab_v3.py) for the exact flow. We use deeplab-v3-resnet101 as an example in this tutorial. 
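For orientation, here is a minimal sketch of the generic ExecuTorch export path that the example script builds on (`torch.export` → `to_edge` → `to_executorch`). It deliberately leaves out the Qualcomm-specific quantization and QNN delegation steps that `deeplab_v3.py` performs, so the `.pte` produced by this sketch targets the portable CPU backend rather than HTP; the output file name is illustrative.

```python
# Minimal sketch of the export flow (portable/CPU only); the deeplab_v3.py
# script additionally quantizes the model and delegates it to the QNN backend.
import torch
from executorch.exir import to_edge
from torchvision.models.segmentation import deeplabv3_resnet101

model = deeplabv3_resnet101(weights=None).eval()
example_inputs = (torch.randn(1, 3, 224, 224),)

# Capture the model graph ahead of time.
exported_program = torch.export.export(model, example_inputs)

# Lower to the ExecuTorch dialect and serialize a .pte file.
et_program = to_edge(exported_program).to_executorch()
with open("dlv3_portable.pte", "wb") as f:
    et_program.write_to_file(f)
```
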
Run below commands to compile: -``` +```bash cd $EXECUTORCH_ROOT -python -m examples.qualcomm.scripts.deeplab_v3 -b build_android -m SM8550 --compile_only --download +# Workaround for fbs files in exir/_serialize +cp schema/program.fbs exir/_serialize/program.fbs +cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs + +python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8550 --compile_only --download ``` You might see something like below: @@ -203,6 +232,58 @@ output output output ([getitem_ The compiled model is `./deeplab_v3/dlv3_qnn.pte`. +### Test model inference on QNN HTP emulator + +We can test model inference with the HTP emulator before deploying it to a device. + +Let's build `qnn_executor_runner` for an x64 host: +```bash +# Assuming the AOT components are built. +cd $EXECUTORCH_ROOT/cmake-out +cmake ../examples/qualcomm \ + -DCMAKE_PREFIX_PATH="$PWD/lib/cmake/ExecuTorch;$PWD/third-party/gflags;" \ + -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ + -DPYTHON_EXECUTABLE=python3 \ + -Bexamples/qualcomm + +cmake --build examples/qualcomm -j$(nproc) + +# qnn_executor_runner can be found under examples/qualcomm +# The full path is $EXECUTORCH_ROOT/cmake-out/examples/qualcomm/qnn_executor_runner +ls examples/qualcomm/ +``` + +To run the HTP emulator, the dynamic linker needs to access QNN libraries and `libqnn_executorch_backend.so`. +We add the two paths below to the `LD_LIBRARY_PATH` environment variable: + 1. `$QNN_SDK_ROOT/lib/x86_64-linux-clang/` + 2. `$EXECUTORCH_ROOT/cmake-out/lib/` + +The first path is for the QNN libraries, including the HTP emulator. It was already configured in the AOT compilation section. + +The second path is for `libqnn_executorch_backend.so`. + +We can then run `./deeplab_v3/dlv3_qnn.pte` with: +```bash +cd $EXECUTORCH_ROOT/cmake-out +export LD_LIBRARY_PATH=$EXECUTORCH_ROOT/cmake-out/lib/:$LD_LIBRARY_PATH +examples/qualcomm/qnn_executor_runner --model_path ../deeplab_v3/dlv3_qnn.pte +``` + +We should see output like the following. Note that the emulator can take some time to finish. +```bash +I 00:00:00.354662 executorch:qnn_executor_runner.cpp:213] Method loaded. +I 00:00:00.356460 executorch:qnn_executor_runner.cpp:261] ignoring error from set_output_data_ptr(): 0x2 +I 00:00:00.357991 executorch:qnn_executor_runner.cpp:261] ignoring error from set_output_data_ptr(): 0x2 +I 00:00:00.357996 executorch:qnn_executor_runner.cpp:265] Inputs prepared. + +I 00:01:09.328144 executorch:qnn_executor_runner.cpp:414] Model executed successfully. +I 00:01:09.328159 executorch:qnn_executor_runner.cpp:421] Write etdump to etdump.etdp, Size = 424 +[INFO] [Qnn ExecuTorch]: Destroy Qnn backend parameters +[INFO] [Qnn ExecuTorch]: Destroy Qnn context +[INFO] [Qnn ExecuTorch]: Destroy Qnn device +[INFO] [Qnn ExecuTorch]: Destroy Qnn backend +``` + ### Run model inference on an Android smartphone with Qualcomm SoCs ***Step 1***. We need to push required QNN libraries to the device. @@ -212,11 +293,13 @@ The compiled model is `./deeplab_v3/dlv3_qnn.pte`. 
DEVICE_DIR=/data/local/tmp/executorch_qualcomm_tutorial/ adb shell "mkdir -p ${DEVICE_DIR}" adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV73Stub.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV75Stub.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR} ``` ***Step 2***. We also need to indicate dynamic linkers on Android and Hexagon @@ -225,8 +308,8 @@ So, we can run `qnn_executor_runner` like ```bash adb push ./deeplab_v3/dlv3_qnn.pte ${DEVICE_DIR} -adb push ${EXECUTORCH_ROOT}/build_android/examples/qualcomm/qnn_executor_runner ${DEVICE_DIR} -adb push ${EXECUTORCH_ROOT}/build_android/lib/libqnn_executorch_backend.so ${DEVICE_DIR} +adb push ${EXECUTORCH_ROOT}/cmake-out-android/examples/qualcomm/executor_runner/qnn_executor_runner ${DEVICE_DIR} +adb push ${EXECUTORCH_ROOT}/cmake-out-android/lib/libqnn_executorch_backend.so ${DEVICE_DIR} adb shell "cd ${DEVICE_DIR} \ && export LD_LIBRARY_PATH=${DEVICE_DIR} \ && export ADSP_LIBRARY_PATH=${DEVICE_DIR} \ @@ -236,24 +319,43 @@ adb shell "cd ${DEVICE_DIR} \ You should see something like below: ``` -I 00:00:01.835706 executorch:qnn_executor_runner.cpp:298] 100 inference took 1096.626000 ms, avg 10.966260 ms -[INFO][Qnn ExecuTorch] Destroy Qnn backend parameters -[INFO][Qnn ExecuTorch] Destroy Qnn context -[INFO][Qnn ExecuTorch] Destroy Qnn device -[INFO][Qnn ExecuTorch] Destroy Qnn backend +I 00:00:00.257354 executorch:qnn_executor_runner.cpp:213] Method loaded. +I 00:00:00.323502 executorch:qnn_executor_runner.cpp:262] ignoring error from set_output_data_ptr(): 0x2 +I 00:00:00.357496 executorch:qnn_executor_runner.cpp:262] ignoring error from set_output_data_ptr(): 0x2 +I 00:00:00.357555 executorch:qnn_executor_runner.cpp:265] Inputs prepared. +I 00:00:00.364824 executorch:qnn_executor_runner.cpp:414] Model executed successfully. +I 00:00:00.364875 executorch:qnn_executor_runner.cpp:425] Write etdump to etdump.etdp, Size = 424 +[INFO] [Qnn ExecuTorch]: Destroy Qnn backend parameters +[INFO] [Qnn ExecuTorch]: Destroy Qnn context +[INFO] [Qnn ExecuTorch]: Destroy Qnn backend ``` +The above command merely executes the model. If we want to feed real inputs and get model outputs, we can use +```bash +cd $EXECUTORCH_ROOT +python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8550 --download -s <device_serial> +``` +The `<device_serial>` can be found with the `adb devices` command. + +After the above command, pre-processed inputs and outputs are placed in the `$EXECUTORCH_ROOT/deeplab_v3` and `$EXECUTORCH_ROOT/deeplab_v3/outputs` folders. + +The command-line arguments are defined in [utils.py](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/scripts/utils.py#L127). +The model, inputs, and output location are passed to `qnn_executor_runner` via `--model_path`, `--input_list_path`, and `--output_folder_path`. + ### Running a model via ExecuTorch's android demo-app An Android demo-app using Qualcomm AI Engine Direct Backend can be found in `examples`. 
Please refer to android demo app [tutorial](https://pytorch.org/executorch/stable/demo-apps-android.html). +## Supported model list + +Please refer to `$EXECUTORCH_ROOT/examples/qualcomm/scripts/` and `$EXECUTORCH_ROOT/examples/qualcomm/oss_scripts/` for the list of supported models. ## What is coming? - - [An example using quantized mobilebert](https://github.com/pytorch/executorch/pull/1043) to solve multi-class text classification. - - More Qualcomm AI Engine Direct accelerators, e.g., GPU. + - [llama2 and llama3](https://github.com/pytorch/executorch/pull/4030). Note that at the time of writing, we still see quantization issues with llama2-7B and llama3-8B; only storiesllama works well. + - We will support pre-compiled binaries from [Qualcomm AI Hub](https://aihub.qualcomm.com/). ## FAQ diff --git a/examples/README.md b/examples/README.md index 0b0ff0daf3..f36e873e84 100644 --- a/examples/README.md +++ b/examples/README.md @@ -63,9 +63,9 @@ The [`arm/`](./arm) directory contains scripts to help you run a PyTorch model o You will find demos of [ExecuTorch QNN Backend](./qualcomm) in the [`qualcomm/`](./qualcomm) directory. -## Demo of ExecuTorch on Xtensa HiFi4 DSP +## Demo of ExecuTorch on Cadence HiFi4 DSP -The [`xtensa/`](./xtensa) directory hosts a demo that showcases the process of exporting and executing a model on Xtensa Hifi4 DSP. You can utilize [this tutorial](../docs/source/build-run-xtensa.md) to guide you in configuring the demo and running it. +The [`cadence/`](./cadence) directory hosts a demo that showcases the process of exporting and executing a model on the Xtensa HiFi4 DSP. You can utilize [this tutorial](../docs/source/build-run-xtensa.md) to guide you in configuring the demo and running it. ## Dependencies diff --git a/examples/apple/mps/CMakeLists.txt b/examples/apple/mps/CMakeLists.txt index 9ae528668c..d1dd8e93d7 100644 --- a/examples/apple/mps/CMakeLists.txt +++ b/examples/apple/mps/CMakeLists.txt @@ -107,6 +107,10 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$") set(FLATCC_LIB flatccrt) endif() + if(CMAKE_BUILD_TYPE MATCHES "Debug") + target_link_options(mps_executor_runner PUBLIC -fsanitize=undefined) + endif() + target_link_libraries( mps_executor_runner bundled_program diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md index 1d993da3d4..89d8c34ee3 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/README.md +++ b/examples/demo-apps/android/ExecuTorchDemo/README.md @@ -53,7 +53,7 @@ For delegating to Qualcomm Hexagon NPU, please follow the tutorial [here](build- After generating the model, copy the model to `assets` directory. 
```bash -python -m examples.qualcomm.scripts.deeplab_v3 -b build_android -m SM8450 -s +python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8450 -s cp deeplab_v3/dlv3_qnn.pte examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ ``` diff --git a/examples/demo-apps/android/LlamaDemo/android-llama2-device-farm-test-spec.yml b/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml similarity index 81% rename from examples/demo-apps/android/LlamaDemo/android-llama2-device-farm-test-spec.yml rename to examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml index 4df9f18cc5..cac83b8e6f 100644 --- a/examples/demo-apps/android/LlamaDemo/android-llama2-device-farm-test-spec.yml +++ b/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml @@ -11,10 +11,10 @@ phases: # Prepare the model and the tokenizer - adb -s $DEVICEFARM_DEVICE_UDID shell "ls -la /sdcard/" - adb -s $DEVICEFARM_DEVICE_UDID shell "mkdir -p /data/local/tmp/llama/" - - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/tokenizer.bin /data/local/tmp/llama/tokenizer.bin" - - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/xnnpack_llama2.pte /data/local/tmp/llama/xnnpack_llama2.pte" - - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/llama/tokenizer.bin" - - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/llama/xnnpack_llama2.pte" + - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/*.bin /data/local/tmp/llama/" + - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/*.pte /data/local/tmp/llama/" + - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/llama/*.bin" + - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/llama/*.pte" - adb -s $DEVICEFARM_DEVICE_UDID shell "ls -la /data/local/tmp/llama/" test: @@ -50,14 +50,8 @@ phases: false; elif [ $TESTS_FAILED -ne 0 ]; then - OBSERVED_TPS=$(grep "The observed TPS " $INSTRUMENT_LOG | tail -n 1) - - if [ -n "${OBSERVED_TPS}" ]; - then - echo "[PyTorch] ${OBSERVED_TPS}"; - else - echo "[PyTorch] Marking the test suite as failed because it failed to load the model"; - fi + echo "[PyTorch] Marking the test suite as failed because it failed to load the model"; + false; elif [ $TESTS_ERRORED -ne 0 ]; then echo "[PyTorch] Marking the test suite as failed because $TESTS_ERRORED tests errored!"; @@ -66,6 +60,17 @@ phases: then echo "[PyTorch] Marking the test suite as failed because the app crashed due to OOM!"; false; + # Check for this last to make sure that there is no failure + elif [ $TESTS_PASSED -ne 0 ]; + then + OBSERVED_TPS=$(grep "INSTRUMENTATION_STATUS: TPS=" $INSTRUMENT_LOG | tail -n 1) + + if [ -n "${OBSERVED_TPS}" ]; + then + echo "[PyTorch] ${OBSERVED_TPS}"; + else + echo "[PyTorch] Test passes but couldn't find the observed TPS from instrument log"; + fi fi; post_test: diff --git a/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java b/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java index b8988d1f4b..221a9bd741 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java @@ -8,12 +8,15 @@ package com.example.executorchllamademo; -import static junit.framework.TestCase.assertTrue; import static org.junit.Assert.assertEquals; import static 
org.junit.Assert.assertFalse; +import android.os.Bundle; import androidx.test.ext.junit.runners.AndroidJUnit4; +import androidx.test.platform.app.InstrumentationRegistry; +import java.io.File; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.junit.Test; import org.junit.runner.RunWith; @@ -24,33 +27,35 @@ public class PerfTest implements LlamaCallback { private static final String RESOURCE_PATH = "/data/local/tmp/llama/"; - private static final String MODEL_NAME = "xnnpack_llama2.pte"; private static final String TOKENIZER_BIN = "tokenizer.bin"; - // From https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md - private static final Float EXPECTED_TPS = 10.0F; - private final List results = new ArrayList<>(); private final List tokensPerSecond = new ArrayList<>(); @Test public void testTokensPerSecond() { - String modelPath = RESOURCE_PATH + MODEL_NAME; String tokenizerPath = RESOURCE_PATH + TOKENIZER_BIN; - LlamaModule mModule = new LlamaModule(modelPath, tokenizerPath, 0.8f); + // Find out the model name + File directory = new File(RESOURCE_PATH); + Arrays.stream(directory.listFiles()) + .filter(file -> file.getName().endsWith(".pte")) + .forEach( + model -> { + LlamaModule mModule = new LlamaModule(model.getPath(), tokenizerPath, 0.8f); + // Print the model name because there might be more than one of them + report("ModelName", model.getName()); - int loadResult = mModule.load(); - // Check that the model can be load successfully - assertEquals(0, loadResult); + int loadResult = mModule.load(); + // Check that the model can be load successfully + assertEquals(0, loadResult); - // Run a testing prompt - mModule.generate("How do you do! I'm testing llama2 on mobile device", PerfTest.this); - assertFalse(tokensPerSecond.isEmpty()); + // Run a testing prompt + mModule.generate("How do you do! 
I'm testing llama2 on mobile device", PerfTest.this); + assertFalse(tokensPerSecond.isEmpty()); - final Float tps = tokensPerSecond.get(tokensPerSecond.size() - 1); - assertTrue( - "The observed TPS " + tps + " is less than the expected TPS " + EXPECTED_TPS, - tps >= EXPECTED_TPS); + final Float tps = tokensPerSecond.get(tokensPerSecond.size() - 1); + report("TPS", tps); + }); } @Override @@ -62,4 +67,16 @@ public void onResult(String result) { public void onStats(float tps) { tokensPerSecond.add(tps); } + + private void report(final String metric, final Float value) { + Bundle bundle = new Bundle(); + bundle.putFloat(metric, value); + InstrumentationRegistry.getInstrumentation().sendStatus(0, bundle); + } + + private void report(final String key, final String value) { + Bundle bundle = new Bundle(); + bundle.putString(key, value); + InstrumentationRegistry.getInstrumentation().sendStatus(0, bundle); + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml index bb231420df..02d8503a4d 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml @@ -47,6 +47,15 @@ + + + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java new file mode 100644 index 0000000000..33b230b1df --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java @@ -0,0 +1,111 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package com.example.executorchllamademo; + +import android.app.Activity; +import android.content.Intent; +import android.os.Bundle; +import android.util.Log; +import android.widget.TextView; +import androidx.annotation.NonNull; +import java.io.FileWriter; +import java.io.IOException; + +public class LlmBenchmarkRunner extends Activity implements ModelRunnerCallback { + ModelRunner mModelRunner; + + String mPrompt; + TextView mTextView; + StatsDump mStatsDump; + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_benchmarking); + mTextView = findViewById(R.id.log_view); + + Intent intent = getIntent(); + + String modelPath = intent.getStringExtra("model_path"); + String tokenizerPath = intent.getStringExtra("tokenizer_path"); + + float temperature = intent.getFloatExtra("temperature", 0.8f); + mPrompt = intent.getStringExtra("prompt"); + if (mPrompt == null) { + mPrompt = "The ultimate answer"; + } + + mStatsDump = new StatsDump(); + mModelRunner = new ModelRunner(modelPath, tokenizerPath, temperature, this); + mStatsDump.loadStart = System.currentTimeMillis(); + } + + @Override + public void onModelLoaded(int status) { + mStatsDump.loadEnd = System.currentTimeMillis(); + if (status != 0) { + Log.e("LlmBenchmarkRunner", "Loaded failed: " + status); + onGenerationStopped(); + return; + } + mStatsDump.generateStart = System.currentTimeMillis(); + mModelRunner.generate(mPrompt); + } + + @Override + public void onTokenGenerated(String token) { + runOnUiThread( + () -> { + mTextView.append(token); + }); + } + + @Override + public void onStats(String stats) { + mStatsDump.tokens = stats; + } + + @Override + public void onGenerationStopped() { + mStatsDump.generateEnd = System.currentTimeMillis(); + runOnUiThread( + () -> { + mTextView.append(mStatsDump.toString()); + }); + + try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) { + writer.write(mStatsDump.toString()); + } catch (IOException e) { + e.printStackTrace(); + } + } +} + +class StatsDump { + long loadStart; + long loadEnd; + long generateStart; + long generateEnd; + String tokens; + + @NonNull + @Override + public String toString() { + return "loadStart: " + + loadStart + + "\nloadEnd: " + + loadEnd + + "\ngenerateStart: " + + generateStart + + "\ngenerateEnd: " + + generateEnd + + "\n" + + tokens; + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java new file mode 100644 index 0000000000..4dc32d1475 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java @@ -0,0 +1,98 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.os.Handler; +import android.os.HandlerThread; +import android.os.Looper; +import android.os.Message; +import androidx.annotation.NonNull; +import org.pytorch.executorch.LlamaCallback; +import org.pytorch.executorch.LlamaModule; + +/** A helper class to handle all model running logic within this class. 
*/ +public class ModelRunner implements LlamaCallback { + LlamaModule mModule = null; + + String mModelFilePath = ""; + String mTokenizerFilePath = ""; + + ModelRunnerCallback mCallback = null; + + HandlerThread mHandlerThread = null; + Handler mHandler = null; + + /** + * ] Helper class to separate between UI logic and model runner logic. Automatically handle + * generate() request on worker thread. + * + * @param modelFilePath + * @param tokenizerFilePath + * @param callback + */ + ModelRunner( + String modelFilePath, + String tokenizerFilePath, + float temperature, + ModelRunnerCallback callback) { + mModelFilePath = modelFilePath; + mTokenizerFilePath = tokenizerFilePath; + mCallback = callback; + + mModule = new LlamaModule(mModelFilePath, mTokenizerFilePath, 0.8f); + mHandlerThread = new HandlerThread("ModelRunner"); + mHandlerThread.start(); + mHandler = new ModelRunnerHandler(mHandlerThread.getLooper(), this); + + mHandler.sendEmptyMessage(ModelRunnerHandler.MESSAGE_LOAD_MODEL); + } + + int generate(String prompt) { + Message msg = Message.obtain(mHandler, ModelRunnerHandler.MESSAGE_GENERATE, prompt); + msg.sendToTarget(); + return 0; + } + + void stop() { + mModule.stop(); + } + + @Override + public void onResult(String result) { + mCallback.onTokenGenerated(result); + } + + @Override + public void onStats(float tps) { + mCallback.onStats("tokens/second: " + tps); + } +} + +class ModelRunnerHandler extends Handler { + public static int MESSAGE_LOAD_MODEL = 1; + public static int MESSAGE_GENERATE = 2; + + private final ModelRunner mModelRunner; + + public ModelRunnerHandler(Looper looper, ModelRunner modelRunner) { + super(looper); + mModelRunner = modelRunner; + } + + @Override + public void handleMessage(@NonNull android.os.Message msg) { + if (msg.what == MESSAGE_LOAD_MODEL) { + int status = mModelRunner.mModule.load(); + mModelRunner.mCallback.onModelLoaded(status); + } else if (msg.what == MESSAGE_GENERATE) { + mModelRunner.mModule.generate((String) msg.obj, mModelRunner); + mModelRunner.mCallback.onGenerationStopped(); + } + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunnerCallback.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunnerCallback.java new file mode 100644 index 0000000000..c8bdc53075 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunnerCallback.java @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +/** + * A helper interface within the app for MainActivity and Benchmarking to handle callback from + * ModelRunner. 
+ */ +public interface ModelRunnerCallback { + + void onModelLoaded(int status); + + void onTokenGenerated(String token); + + void onStats(String token); + + void onGenerationStopped(); +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_benchmarking.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_benchmarking.xml new file mode 100644 index 0000000000..6e48b5de8b --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_benchmarking.xml @@ -0,0 +1,16 @@ + + + + + + diff --git a/examples/llm_manual/managed_tensor.h b/examples/llm_manual/managed_tensor.h index d401ae4d18..d870f4861e 100644 --- a/examples/llm_manual/managed_tensor.h +++ b/examples/llm_manual/managed_tensor.h @@ -30,28 +30,21 @@ class ManagedTensor { using DimOrderType = exec_aten::DimOrderType; /// The type used for elements of `strides()`. using StridesType = exec_aten::StridesType; + ManagedTensor() = delete; explicit ManagedTensor( void* data, const std::vector& sizes, ScalarType dtype) - : dtype_(dtype), sizes_(sizes), data_ptr_(data) { - ssize_t dim = sizes.size(); - dim_order_.resize(dim); - strides_.resize(dim); - for (size_t i = 0; i < dim; ++i) { - dim_order_[i] = i; - } - dim_order_to_stride_nocheck( - sizes.data(), dim_order_.data(), dim, strides_.data()); + : sizes_(sizes) { tensor_impl_ = std::make_unique( - dtype_, - dim, + dtype, + sizes_.size(), sizes_.data(), - data_ptr_, - dim_order_.data(), - strides_.data(), + data, + nullptr, + nullptr, TensorShapeDynamism::DYNAMIC_BOUND); } @@ -63,12 +56,9 @@ class ManagedTensor { } private: - void* data_ptr_ = nullptr; std::unique_ptr tensor_impl_; std::vector sizes_; - std::vector strides_; - std::vector dim_order_; - ScalarType dtype_; }; + } // namespace executor } // namespace torch diff --git a/examples/models/flamingo/__init__.py b/examples/models/flamingo/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/models/flamingo/export_preprocess.py b/examples/models/flamingo/export_preprocess.py new file mode 100644 index 0000000000..c5a930c88c --- /dev/null +++ b/examples/models/flamingo/export_preprocess.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from export_preprocess_lib import export_preprocess, lower_to_executorch_preprocess + + +def main(): + ep = export_preprocess() + et = lower_to_executorch_preprocess(ep) + + with open("preprocess.pte", "wb") as file: + et.write_to_file(file) + + +if __name__ == "__main__": + main() diff --git a/examples/models/flamingo/export_preprocess_lib.py b/examples/models/flamingo/export_preprocess_lib.py new file mode 100644 index 0000000000..736116de8b --- /dev/null +++ b/examples/models/flamingo/export_preprocess_lib.py @@ -0,0 +1,87 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Dict, List, Optional, Tuple + +import torch +from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig, to_edge +from executorch.exir.program._program import ExecutorchProgramManager + +from executorch.extension.llm.custom_ops import preprocess_custom_ops # noqa + +from torch.export import Dim, ExportedProgram +from torchtune.models.clip.inference._transforms import _CLIPImageTransform + +from .passes.replace_custom_ops_with_aten_ops_pass import ( + ReplaceCustomOpsWithAtenOpsPass, +) + + +def get_example_inputs() -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + image = torch.ones(3, 800, 600) + target_size = torch.tensor([448, 336]) + canvas_size = torch.tensor([448, 448]) + return (image, target_size, canvas_size) + + +def get_dynamic_shapes() -> Dict[str, Dict[int, Dim]]: + img_h = Dim("img_h", min=1, max=4000) + img_w = Dim("img_w", min=1, max=4000) + + dynamic_shapes = { + "image": {1: img_h, 2: img_w}, + "target_size": None, + "canvas_size": None, + } + return dynamic_shapes + + +def export_preprocess( + resample: str = "bilinear", + image_mean: Optional[List[float]] = None, + image_std: Optional[List[float]] = None, + max_num_tiles: int = 4, + tile_size: int = 224, + antialias: bool = False, +) -> ExportedProgram: + + # Instantiate eager model. + image_transform_model = _CLIPImageTransform( + resample=resample, + image_mean=image_mean, + image_std=image_std, + max_num_tiles=max_num_tiles, + tile_size=tile_size, + antialias=antialias, + ) + + # Replace non-exportable ops with custom ops. + image_transform_model.pad = torch.ops.preprocess.pad.default + image_transform_model.tile_crop = torch.ops.preprocess.tile_crop.default + + # Export. + example_inputs = get_example_inputs() + dynamic_shapes = get_dynamic_shapes() + ep = torch.export.export( + image_transform_model, + example_inputs, + dynamic_shapes=dynamic_shapes, + strict=False, + ) + return ep + + +def lower_to_executorch_preprocess( + exported_program: ExportedProgram, +) -> ExecutorchProgramManager: + edge_program = to_edge( + exported_program, compile_config=EdgeCompileConfig(_check_ir_validity=False) + ) + # Replace custom ops with aten ops. + edge_program = edge_program.transform([ReplaceCustomOpsWithAtenOpsPass()]) + + et_program = edge_program.to_executorch(ExecutorchBackendConfig()) + return et_program diff --git a/examples/models/flamingo/install_requirements.sh b/examples/models/flamingo/install_requirements.sh new file mode 100644 index 0000000000..0bcf302ca9 --- /dev/null +++ b/examples/models/flamingo/install_requirements.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Install torchtune nightly for model definitions. +pip install --pre torchtune --extra-index-url https://download.pytorch.org/whl/nightly/cpu --no-cache-dir diff --git a/examples/models/flamingo/passes/__init__.py b/examples/models/flamingo/passes/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/models/flamingo/passes/replace_custom_ops_with_aten_ops_pass.py b/examples/models/flamingo/passes/replace_custom_ops_with_aten_ops_pass.py new file mode 100644 index 0000000000..8c31cf512c --- /dev/null +++ b/examples/models/flamingo/passes/replace_custom_ops_with_aten_ops_pass.py @@ -0,0 +1,31 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import torch +from executorch.exir.pass_base import ExportPass +from executorch.extension.llm.custom_ops import preprocess_custom_ops # noqa + + +class ReplaceCustomOpsWithAtenOpsPass(ExportPass): + """ + Goes through all ops and replaces custom ops with aten ops. In some cases + aten ops cannot be exported due to dynamism, eg. pad in flamingo preprocess. + Use a custom op to pass export, and replace it with the aten op post-export, + which avoids re-writing the op in C++. + """ + + def __init__(self) -> None: + super().__init__() + + def call_operator(self, op, args, kwargs, meta): + if op._name == "preprocess::pad": + return super().call_operator( + torch.ops.aten.constant_pad_nd.default, args, kwargs, meta + ) + + return super().call_operator(op, args, kwargs, meta) diff --git a/examples/models/flamingo/passes/test_passes.py b/examples/models/flamingo/passes/test_passes.py new file mode 100644 index 0000000000..d0a90f2e34 --- /dev/null +++ b/examples/models/flamingo/passes/test_passes.py @@ -0,0 +1,50 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import unittest + +from typing import List + +import torch +from executorch.exir import EdgeCompileConfig, to_edge + +from .replace_custom_ops_with_aten_ops_pass import ReplaceCustomOpsWithAtenOpsPass + + +class TestPasses(unittest.TestCase): + def test_replace_custom_ops_with_aten_ops_pass(self) -> None: + from executorch.extension.llm.custom_ops import preprocess_custom_ops # noqa + + class Pad(torch.nn.Module): + def forward(self, x: torch.Tensor, padding: List[int]) -> torch.Tensor: + return torch.ops.preprocess.pad.default(x, padding) + + pad = Pad() + + image_tensor = torch.ones([3, 4, 5]) + padding = [0, 2, 0, 1] + + edge_prog = to_edge( + torch.export.export(pad, (image_tensor, padding), strict=False), + compile_config=EdgeCompileConfig(_check_ir_validity=False), + ) + + # Check that the custom op exists in the graph, and aten op does not. + edge_nodes = [node.name for node in edge_prog.exported_program().graph.nodes] + assert "constant_pad_nd" not in edge_nodes + assert "preprocess_pad_default" in edge_nodes + + edge_prog = edge_prog.transform([ReplaceCustomOpsWithAtenOpsPass()]) + + # After running replace_custom_ops_with_aten_ops pass, the custom op + # should be replaced with aten op. + post_transform_nodes = [ + node.name for node in edge_prog.exported_program().graph.nodes + ] + assert "constant_pad_nd" in post_transform_nodes + assert "preprocess_pad_default" not in post_transform_nodes diff --git a/examples/models/flamingo/test_preprocess.py b/examples/models/flamingo/test_preprocess.py new file mode 100644 index 0000000000..896a01655e --- /dev/null +++ b/examples/models/flamingo/test_preprocess.py @@ -0,0 +1,244 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import numpy as np +import PIL +import torch + +from parameterized import parameterized +from PIL import Image + +from torchtune.models.clip.inference._transforms import ( + _CLIPImageTransform, + CLIPImageTransform, +) + +from torchtune.modules.transforms import ( + find_supported_resolutions, + get_canvas_best_fit, + get_inscribed_size, +) +from torchvision.transforms.v2 import functional as F + +from .export_preprocess_lib import export_preprocess + + +@dataclass +class PreprocessConfig: + image_mean: Optional[List[float]] = None + image_std: Optional[List[float]] = None + resize_to_max_canvas: bool = True + resample: str = "bilinear" + antialias: bool = False + tile_size: int = 224 + max_num_tiles: int = 4 + possible_resolutions = None + + +class TestImageTransform(unittest.TestCase): + """ + This unittest checks that the exported image transform model produces the + same output as the reference model. + + Reference model: CLIPImageTransform + https://github.com/pytorch/torchtune/blob/main/torchtune/models/clip/inference/_transforms.py#L115 + Eager and exported models: _CLIPImageTransform + https://github.com/pytorch/torchtune/blob/main/torchtune/models/clip/inference/_transforms.py#L26 + """ + + def setUp(self): + np.random.seed(0) + + def prepare_inputs( + self, image: Image.Image, config: PreprocessConfig + ) -> Tuple[torch.Tensor]: + """ + Prepare inputs for eager and exported models: + - Convert PIL image to tensor. + - Calculate the best resolution; a canvas with height and width divisible by tile_size. + - Calculate the inscribed size; the size of the image inscribed within best_resolution, + without distortion. + + These calculations are done by the reference model inside __init__ and __call__ + https://github.com/pytorch/torchtune/blob/main/torchtune/models/clip/inference/_transforms.py#L115 + """ + image_tensor = F.to_dtype( + F.grayscale_to_rgb_image(F.to_image(image)), scale=True + ) + + # Calculate possible resolutions. + possible_resolutions = config.possible_resolutions + if possible_resolutions is None: + possible_resolutions = find_supported_resolutions( + max_num_tiles=config.max_num_tiles, tile_size=config.tile_size + ) + possible_resolutions = torch.tensor(possible_resolutions).reshape(-1, 2) + + # Limit resizing. + max_size = None if config.resize_to_max_canvas else config.tile_size + + # Find the best canvas to fit the image without distortion. + best_resolution = get_canvas_best_fit( + image=image_tensor, + possible_resolutions=possible_resolutions, + resize_to_max_canvas=config.resize_to_max_canvas, + ) + best_resolution = torch.tensor(best_resolution) + + # Find the dimensions of the image, such that it is inscribed within best_resolution + # without distortion. + inscribed_size = get_inscribed_size( + image_tensor.shape[-2:], best_resolution, max_size + ) + inscribed_size = torch.tensor(inscribed_size) + + return image_tensor, inscribed_size, best_resolution + + # This test setup mirrors the one in torchtune: + # https://github.com/pytorch/torchtune/blob/main/tests/torchtune/models/clip/test_clip_image_transform.py + # The values are slightly different, as torchtune uses antialias=True, + # and this test uses antialias=False, which is exportable (has a portable kernel). 
+ @parameterized.expand( + [ + ( + (100, 400, 3), # image_size + torch.Size([2, 3, 224, 224]), # expected shape + False, # resize_to_max_canvas + [0.2230, 0.1763], # expected_tile_means + [1.0, 1.0], # expected_tile_max + [0.0, 0.0], # expected_tile_min + [1, 2], # expected_aspect_ratio + ), + ( + (1000, 300, 3), # image_size + torch.Size([4, 3, 224, 224]), # expected shape + True, # resize_to_max_canvas + [0.5005, 0.4992, 0.5004, 0.1651], # expected_tile_means + [0.9976, 0.9940, 0.9936, 0.9906], # expected_tile_max + [0.0037, 0.0047, 0.0039, 0.0], # expected_tile_min + [4, 1], # expected_aspect_ratio + ), + ( + (200, 200, 3), # image_size + torch.Size([4, 3, 224, 224]), # expected shape + True, # resize_to_max_canvas + [0.5012, 0.5020, 0.5010, 0.4991], # expected_tile_means + [0.9921, 0.9925, 0.9969, 0.9908], # expected_tile_max + [0.0056, 0.0069, 0.0059, 0.0032], # expected_tile_min + [2, 2], # expected_aspect_ratio + ), + ( + (600, 200, 3), # image_size + torch.Size([3, 3, 224, 224]), # expected shape + False, # resize_to_max_canvas + [0.4472, 0.4468, 0.3031], # expected_tile_means + [1.0, 1.0, 1.0], # expected_tile_max + [0.0, 0.0, 0.0], # expected_tile_min + [3, 1], # expected_aspect_ratio + ), + ] + ) + def test_preprocess( + self, + image_size: Tuple[int], + expected_shape: torch.Size, + resize_to_max_canvas: bool, + expected_tile_means: List[float], + expected_tile_max: List[float], + expected_tile_min: List[float], + expected_ar: List[int], + ) -> None: + config = PreprocessConfig(resize_to_max_canvas=resize_to_max_canvas) + + reference_model = CLIPImageTransform( + image_mean=config.image_mean, + image_std=config.image_std, + resize_to_max_canvas=config.resize_to_max_canvas, + resample=config.resample, + antialias=config.antialias, + tile_size=config.tile_size, + max_num_tiles=config.max_num_tiles, + possible_resolutions=None, + ) + + eager_model = _CLIPImageTransform( + image_mean=config.image_mean, + image_std=config.image_std, + resample=config.resample, + antialias=config.antialias, + tile_size=config.tile_size, + max_num_tiles=config.max_num_tiles, + ) + + exported_model = export_preprocess( + image_mean=config.image_mean, + image_std=config.image_std, + resample=config.resample, + antialias=config.antialias, + tile_size=config.tile_size, + max_num_tiles=config.max_num_tiles, + ) + + # Prepare image input. + image = ( + np.random.randint(0, 256, np.prod(image_size)) + .reshape(image_size) + .astype(np.uint8) + ) + image = PIL.Image.fromarray(image) + + # Run reference model. + reference_output = reference_model(image=image) + reference_image = reference_output["image"] + reference_ar = reference_output["aspect_ratio"].tolist() + + # Check output shape and aspect ratio matches expected values. + self.assertEqual(reference_image.shape, expected_shape) + self.assertEqual(reference_ar, expected_ar) + + # Check pixel values within expected range [0, 1] + self.assertTrue(0 <= reference_image.min() <= reference_image.max() <= 1) + + # Check mean, max, and min values of the tiles match expected values. + for i, tile in enumerate(reference_image): + self.assertAlmostEqual( + tile.mean().item(), expected_tile_means[i], delta=1e-4 + ) + self.assertAlmostEqual(tile.max().item(), expected_tile_max[i], delta=1e-4) + self.assertAlmostEqual(tile.min().item(), expected_tile_min[i], delta=1e-4) + + # Check num tiles matches the product of the aspect ratio. 
+ expected_num_tiles = reference_ar[0] * reference_ar[1] + self.assertEqual(expected_num_tiles, reference_image.shape[0]) + + # Pre-work for eager and exported models. The reference model performs these + # calculations and passes the result to _CLIPImageTransform, the exportable model. + image_tensor, inscribed_size, best_resolution = self.prepare_inputs( + image=image, config=config + ) + + # Run eager and exported models. + eager_image, eager_ar = eager_model( + image_tensor, inscribed_size, best_resolution + ) + eager_ar = eager_ar.tolist() + + exported_image, exported_ar = exported_model.module()( + image_tensor, inscribed_size, best_resolution + ) + exported_ar = exported_ar.tolist() + + # Check eager and exported models match reference model. + self.assertTrue(torch.allclose(reference_image, eager_image)) + self.assertTrue(torch.allclose(reference_image, exported_image)) + + self.assertEqual(reference_ar, eager_ar) + self.assertEqual(reference_ar, exported_ar) diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index 0ab35b2c50..96302b05f8 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -210,6 +210,7 @@ The Wikitext results generated above used: `{max_seq_len: 2048, limit: 1000}` cmake --build cmake-out -j16 --target install --config Release ``` +Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the section on Common Issues and Mitigations below for solutions. 2. Build llama runner. ``` @@ -353,3 +354,24 @@ pip uninstall executorch rm -rf cmake-out ``` - If you encounter `pthread` related issues during link time, add `pthread` in `target_link_libraries` in `CMakeLists.txt` +- On Mac, if there is a linking error in Step 4 with an error message like +``` +0 0x100823648 __assert_rtn + 72 +1 0x10074bc5c ld::Fixup::applyFixup(ld::Atom const*, ld::LayoutLinkedImage const&, unsigned char*) const + 8268 +2 0x1007de7d8 ___ZN2ld16LayoutExecutable27writeContentWithoutLinkEditENSt3__14spanIhLm18446744073709551615EEEy_block_invoke + 332 +3 0x188cca428 _dispatch_client_callout2 + 20 +4 0x188cde850 _dispatch_apply_invoke3 + 336 +5 0x188cca3e8 _dispatch_client_callout + 20 +6 0x188ccbc68 _dispatch_once_callout + 32 +7 0x188cdeeec _dispatch_apply_invoke_and_wait + 372 +8 0x188cdde9c _dispatch_apply_with_attr_f + 1212 +9 0x188cde08c dispatch_apply + 96 +10 0x1007de9e4 void mapReduce(std::__1::span, unsigned long, void (unsigned long, mach_o::Error&, std::__1::span) block_pointer, void (std::__1::span) block_pointer) + 336 +11 0x1007de594 ld::LayoutExecutable::writeContentWithoutLinkEdit(std::__1::span, unsigned long long) + 1180 +12 0x1007e4020 ld::LayoutExecutable::writeToFile(char const*) + 15248 +13 0x1007962e8 main + 9424 +ld: Assertion failed: (extras.otherInstrOffset != 0 && "Kind::arm64_adrp_ldr missing extra info"), function applyFixup, file Fixup.cpp, line 793. +clang: error: linker command failed with exit code 1 (use -v to see invocation) +``` +It's a known issue for Xcode version 15.1. +Mitigation: update to the most recent Xcode version, then clean and rebuild. 
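Returning briefly to the flamingo preprocess test added above: the eager-versus-exported comparison it performs boils down to the pattern below. This is a toy, self-contained sketch with a made-up module (not the CLIP transform), included only to illustrate how `dynamic_shapes` and `ExportedProgram.module()` are used in `test_preprocess.py`.

```python
# Toy illustration of the eager-vs-exported check used in test_preprocess.py.
import torch
from torch.export import Dim, export


class Scale(torch.nn.Module):
    def forward(self, image: torch.Tensor) -> torch.Tensor:
        return image * 0.5


eager_model = Scale()
example_inputs = (torch.ones(3, 8, 8),)
# Let the spatial dims vary, mirroring img_h/img_w in get_dynamic_shapes().
dynamic_shapes = {
    "image": {1: Dim("img_h", min=1, max=64), 2: Dim("img_w", min=1, max=64)}
}

exported = export(eager_model, example_inputs, dynamic_shapes=dynamic_shapes)

sample = torch.rand(3, 16, 32)
# The exported program is invoked through .module(), just like
# exported_model.module()(...) in the test above.
assert torch.allclose(eager_model(sample), exported.module()(sample))
```
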
diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index eeafa3dee3..56ca1f5873 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -553,27 +553,29 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 def _load_llama_model_metadata( weight_type: WeightType, - dtype: DType, use_kv_cache: bool, use_sdpa_with_kv_cache: bool, enable_dynamic_shape: bool, - modelArgs: ModelArgs, + model_args: ModelArgs, metadata_str: Optional[str] = None, ): is_fairseq2 = weight_type == WeightType.FAIRSEQ2 metadata = { "append_eos_to_prompt": is_fairseq2, # For language llama, tell the runtime to always append EOS token(s) to prompt. - "get_bos_id": 3 if is_fairseq2 else 1, - "get_dtype": 5 if dtype == DType.fp16 else 6, - "get_eos_id": 3 if is_fairseq2 else 2, - "get_head_dim": modelArgs.dim // modelArgs.n_heads, - "get_max_batch_size": modelArgs.max_batch_size, - "get_max_seq_len": modelArgs.max_seq_len, + "get_bos_id": ( + model_args.bos_idx + if model_args.bos_idx is not None + else (3 if is_fairseq2 else 1) + ), + "get_eos_id": ( + model_args.eos_idx + if model_args.eos_idx is not None + else (3 if is_fairseq2 else 2) + ), + "get_max_seq_len": model_args.max_seq_len, "get_n_bos": 1, "get_n_eos": 2 if is_fairseq2 else 1, - "get_n_kv_heads": modelArgs.n_kv_heads, - "get_n_layers": modelArgs.n_layers, - "get_vocab_size": modelArgs.vocab_size, + "get_vocab_size": model_args.vocab_size, "use_kv_cache": use_kv_cache, "use_sdpa_with_kv_cache": use_sdpa_with_kv_cache, "enable_dynamic_shape": enable_dynamic_shape, @@ -655,7 +657,6 @@ def _load_llama_model( verbose=verbose, metadata=_load_llama_model_metadata( weight_type, - dtype, use_kv_cache, use_sdpa_with_kv_cache, enable_dynamic_shape, diff --git a/examples/models/llama2/install_requirements.sh b/examples/models/llama2/install_requirements.sh index d316790d57..6b106c1c21 100755 --- a/examples/models/llama2/install_requirements.sh +++ b/examples/models/llama2/install_requirements.sh @@ -12,7 +12,8 @@ pip install torchao==0.1 # Install lm-eval for Model Evaluation with lm-evalution-harness # Install tiktoken for tokenizer -pip install lm-eval tiktoken blobfile +pip install lm_eval==0.4.2 +pip install tiktoken blobfile # Call the install helper for further setup python examples/models/llama2/install_requirement_helper.py diff --git a/examples/models/llama2/llama_transformer.py b/examples/models/llama2/llama_transformer.py index dacf9eb1fd..99544426fd 100644 --- a/examples/models/llama2/llama_transformer.py +++ b/examples/models/llama2/llama_transformer.py @@ -104,8 +104,8 @@ class ModelArgs: rope_freq_base: float = 10000.0 # The base frequency for RoPE. Keep it for BC. use_scaled_rope: bool = False # Use scaled RoPE, introduced in llama3.1. # Additional Model Metadata needed at runtime - bos_idx: int = 1 - eos_idx: int = 3 + bos_idx: Optional[int] = None + eos_idx: Optional[int] = None bos_count: int = -1 # i.e., a single EOS is used as BOS eos_count: int = 2 diff --git a/examples/models/llama2/runner/eager.py b/examples/models/llama2/runner/eager.py new file mode 100644 index 0000000000..d246a2df21 --- /dev/null +++ b/examples/models/llama2/runner/eager.py @@ -0,0 +1,115 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import argparse +import json +from typing import Optional + +import torch + +from examples.models.llama2.llama_transformer import ModelArgs +from executorch.examples.models.model_factory import EagerModelFactory + +from .generation import LlamaRunner + + +class EagerLlamaRunner(LlamaRunner): + """ + Runs llama in eager mode with provided checkpoint file. + """ + + def __init__(self, args): + with open(args.params, "r") as f: + params = json.loads(f.read()) + model_args: ModelArgs = ModelArgs( + max_seq_len=args.max_len, + max_batch_size=1, + use_kv_cache=True, + **params, + ) + super().__init__(tokenizer_path=args.tokenizer, model_args=model_args) + self.model, _, _ = EagerModelFactory.create_model( + "llama2", + "Llama2Model", + checkpoint=args.checkpoint, + params=args.params, + use_kv_cache=True, + fairseq2=False, + max_seq_len=args.max_len, + enable_dynamic_shape=True, + ) + + def forward( + self, + tokens: Optional[torch.LongTensor] = None, + input_pos: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + return self.model.forward(tokens=tokens, input_pos=input_pos) + + +def build_args_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + + parser.add_argument( + "--checkpoint", + type=str, + default=None, + help="path to model checkpoint file", + ) + + parser.add_argument( + "--params", + type=str, + default=None, + help="model params file", + ) + + parser.add_argument( + "--max_len", + type=int, + default=128, + help="Maximum length of the generated response sequence.", + ) + + parser.add_argument( + "--tokenizer", + type=str, + default=None, + ) + + parser.add_argument( + "--prompt", + type=str, + default="Hello", + ) + + parser.add_argument( + "--temperature", + type=float, + default=0, + ) + + return parser + + +def main() -> None: + parser = build_args_parser() + args = parser.parse_args() + + runner = EagerLlamaRunner(args) + result = runner.text_completion( + prompt=args.prompt, + temperature=args.temperature, + ) + print( + "Response: \n{response}\n Tokens:\n {tokens}".format( + response=result["generation"], tokens=result["tokens"] + ) + ) + + +if __name__ == "__main__": + main() # pragma: no cover diff --git a/examples/models/llama2/runner/generation.py b/examples/models/llama2/runner/generation.py index 56a15005ef..f1a6b54d88 100644 --- a/examples/models/llama2/runner/generation.py +++ b/examples/models/llama2/runner/generation.py @@ -4,34 +4,23 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
- -import argparse - -import json -from typing import List, Optional, Tuple, TypedDict +from abc import ABC, abstractmethod +from typing import List, Optional, TypedDict import torch -import torch.nn.functional as F + from executorch.examples.models.llama2.llama_transformer import ModelArgs +from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer + +from executorch.extension.pybindings import portable_lib # noqa # usort: skip -from executorch.examples.models.llama2.tokenizer.tiktoken import ( - Dialog, - Message, - Tokenizer, -) -from executorch.extension.pybindings.portable_lib import _load_for_executorch +# Note: import this after portable_lib +from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip class CompletionPrediction(TypedDict, total=False): generation: str tokens: List[str] # not required - logprobs: List[float] # not required - - -class ChatPrediction(TypedDict, total=False): - generation: Message - tokens: List[str] # not required - logprobs: List[float] # not required def sample_top_p(probs, p): @@ -47,7 +36,7 @@ def sample_top_p(probs, p): Note: Top-p sampling selects the smallest set of tokens whose cumulative probability mass - exceeds the threshold p. The distribution is renormalized based on the selected tokens. + exceeds the threshold p. The distribution is re-normalized based on the selected tokens. """ probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) probs_sum = torch.cumsum(probs_sort, dim=-1) @@ -59,312 +48,92 @@ def sample_top_p(probs, p): return next_token -class LlamaRunner: - def __init__(self, model_path: str, tokenizer_path: str, model_args: ModelArgs): - # model is a pte file. - self.model = _load_for_executorch(model_path) +def next_token(logits: torch.Tensor, temperature: float, top_p: float) -> int: + if temperature > 0: + probs = torch.softmax(logits[:, -1] / temperature, dim=-1) + return sample_top_p(probs, top_p).item() + return torch.argmax(logits[:, -1], dim=-1).item() + + +class LlamaRunner(ABC): + def __init__(self, tokenizer_path: str, model_args: ModelArgs): self.params = model_args self.tokenizer = Tokenizer(tokenizer_path) assert model_args.vocab_size == self.tokenizer.n_words + @abstractmethod + def forward( + self, + tokens: Optional[torch.LongTensor] = None, + input_pos: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + pass + def generate( # noqa: C901 self, - prompt_tokens: List[List[int]], - max_gen_len: int, + prompt_tokens: List[int], temperature: float = 0.8, top_p: float = 0.9, - logprobs: bool = False, echo: bool = False, - ) -> Tuple[List[List[int]], Optional[List[List[float]]]]: - bsz = len(prompt_tokens) - params = self.params - assert bsz <= params.max_batch_size, (bsz, params.max_batch_size) - - min_prompt_len = min(len(t) for t in prompt_tokens) - max_prompt_len = max(len(t) for t in prompt_tokens) - - assert max_prompt_len <= params.max_seq_len - total_len = min(params.max_seq_len, max_gen_len + max_prompt_len) - pad_id = self.tokenizer.pad_id - tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cpu") - for k, t in enumerate(prompt_tokens): - tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device="cpu") - if logprobs: - token_logprobs = torch.zeros_like(tokens, dtype=torch.float) - - prev_pos = 0 - if self.params.use_kv_cache: - min_prompt_len = 1 - - eos_reached = torch.tensor([False] * bsz, device="cpu") - input_text_mask = tokens != pad_id - pos = torch.tensor([prev_pos], dtype=torch.int64) - if min_prompt_len == 
total_len: - if self.params.use_kv_cache: - inputs = (tokens, pos) - else: - inputs = (tokens,) - logits = self.model.forward(inputs) # updated forward call. - logits = logits[0] - token_logprobs = -F.cross_entropy( - input=logits.transpose(1, 2), - target=tokens, - reduction="none", - ignore_index=pad_id, - ) + ) -> List[int]: + # prefill + logits = self.forward( + tokens=torch.tensor([prompt_tokens], dtype=torch.long), + input_pos=( + torch.tensor([0], dtype=torch.long) + if self.params.use_kv_cache + else None + ), + ) - stop_tokens = torch.tensor(list(self.tokenizer.stop_tokens)) + current_token = next_token(logits, temperature, top_p) + tokens = prompt_tokens + [current_token] - for cur_pos in range(min_prompt_len, total_len): - pos = torch.tensor([prev_pos], dtype=torch.int64) + while len(tokens) < self.params.max_seq_len: if self.params.use_kv_cache: - inputs = (tokens[:, prev_pos:cur_pos], pos) - else: - inputs = (tokens[:, :cur_pos],) - logits = self.model.forward(inputs) # updated forward call. - logits = logits[0] - if temperature > 0: - probs = torch.softmax(logits[:, -1] / temperature, dim=-1) - next_token = sample_top_p(probs, top_p) - else: - next_token = torch.argmax(logits[:, -1], dim=-1) - - next_token = next_token.reshape(-1) - - # only replace token if prompt has already been generated - if not self.params.use_kv_cache or cur_pos < len(prompt_tokens[0]): - next_token = torch.where( - input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token - ) - - tokens[:, cur_pos] = next_token - if logprobs: - token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy( - input=logits.transpose(1, 2), - target=tokens[:, prev_pos + 1 : cur_pos + 1], - reduction="none", - ignore_index=pad_id, + logits = self.forward( + tokens=torch.tensor([[current_token]], dtype=torch.long), + input_pos=torch.tensor([len(tokens) - 1], dtype=torch.long), ) - eos_reached |= (~input_text_mask[:, cur_pos]) & ( - torch.isin(next_token, stop_tokens) - ) - prev_pos = cur_pos - if all(eos_reached): + else: + logits = self.forward(tokens=torch.tensor([tokens], dtype=torch.long)) + current_token = next_token(logits, temperature, top_p) + if current_token in self.tokenizer.stop_tokens: break + tokens.append(current_token) - if logprobs: - token_logprobs = token_logprobs.tolist() - out_tokens, out_logprobs = [], [] - for i, toks in enumerate(tokens.tolist()): - # cut to max gen len - start = 0 if echo else len(prompt_tokens[i]) - toks = toks[start : len(prompt_tokens[i]) + max_gen_len] - probs = None - if logprobs: - probs = token_logprobs[i][start : len(prompt_tokens[i]) + max_gen_len] - # cut to after eos tok if any - for stop_token in self.tokenizer.stop_tokens: - try: - eos_idx = toks.index(stop_token) - toks = toks[:eos_idx] - probs = probs[:eos_idx] if logprobs else None - except ValueError: - pass - out_tokens.append(toks) - out_logprobs.append(probs) - return (out_tokens, out_logprobs if logprobs else None) + return tokens if echo else tokens[len(prompt_tokens) :] def text_completion( self, - prompts: List[str], + prompt: str, temperature: float = 0.6, top_p: float = 0.9, - max_gen_len: Optional[int] = None, - logprobs: bool = False, echo: bool = False, - ) -> List[CompletionPrediction]: + ) -> CompletionPrediction: """ - Perform text completion for a list of prompts using the language generation model. + Perform text completion for a prompt using the language model. Args: - prompts (List[str]): List of text prompts for completion. + prompt (str): Text prompt for completion. 
temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6. top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9. - max_gen_len (Optional[int], optional): Maximum length of the generated completion sequence. - If not provided, it's set to the model's maximum sequence length minus 1. - logprobs (bool, optional): Flag indicating whether to compute token log probabilities. Defaults to False. echo (bool, optional): Flag indicating whether to include prompt tokens in the generated output. Defaults to False. Returns: - List[CompletionPrediction]: List of completion predictions, each containing the generated text completion. + CompletionPrediction: Completion prediction, which contains the generated text completion. Note: - This method generates text completions for the provided prompts, employing nucleus sampling to introduce controlled randomness. - If logprobs is True, token log probabilities are computed for each generated token. + This method generates text completion for the provided prompt, employing nucleus sampling to introduce controlled randomness. """ - if max_gen_len is None: - max_gen_len = self.model.params.max_seq_len - 1 - prompt_tokens = [self.tokenizer.encode(x, bos=True, eos=False) for x in prompts] - generation_tokens, generation_logprobs = self.generate( + prompt_tokens = self.tokenizer.encode(prompt, bos=True, eos=False) + generation_tokens = self.generate( prompt_tokens=prompt_tokens, - max_gen_len=max_gen_len, temperature=temperature, top_p=top_p, - logprobs=logprobs, echo=echo, ) - - if logprobs: - return [ - { - "generation": self.tokenizer.decode(t), - "tokens": [self.tokenizer.decode([x]) for x in t], - "logprobs": logprobs_i, - } - for t, logprobs_i in zip(generation_tokens, generation_logprobs) - ] - return [{"generation": self.tokenizer.decode(t)} for t in generation_tokens] - - def chat_completion( - self, - dialogs: List[Dialog], - temperature: float = 0.6, - top_p: float = 0.9, - max_gen_len: Optional[int] = None, - logprobs: bool = False, - ) -> List[ChatPrediction]: - """ - Generate assistant responses for a list of conversational dialogs using the language generation model. - - Args: - dialogs (List[Dialog]): List of conversational dialogs, where each dialog is a list of messages. - temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6. - top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9. - max_gen_len (Optional[int], optional): Maximum length of the generated response sequence. - If not provided, it's set to the model's maximum sequence length minus 1. - logprobs (bool, optional): Flag indicating whether to compute token log probabilities. Defaults to False. - - Returns: - List[ChatPrediction]: List of chat predictions, each containing the assistant's generated response. - - Raises: - AssertionError: If the last message in a dialog is not from the user. - AssertionError: If the dialog roles are not in the required 'user', 'assistant', and optional 'system' order. - - Note: - This method generates assistant responses for the provided conversational dialogs. - It employs nucleus sampling to introduce controlled randomness in text generation. - If logprobs is True, token log probabilities are computed for each generated token. 
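As a concrete illustration of the nucleus sampling performed by `sample_top_p` and `next_token` above, the following self-contained sketch (toy logits and illustrative temperature/top-p values only, not part of this change) keeps the smallest set of tokens whose cumulative probability exceeds `p`, re-normalizes it, and samples from the result:

```python
# Illustrative only: the nucleus (top-p) sampling used by next_token()/sample_top_p().
import torch

torch.manual_seed(0)
logits = torch.tensor([[[2.0, 1.0, 0.5, -1.0]]])  # (batch=1, seq=1, vocab=4), toy values

temperature, top_p = 0.8, 0.9
probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
mask = torch.cumsum(probs_sort, dim=-1) - probs_sort > top_p  # drop tokens outside the top-p mass
probs_sort[mask] = 0.0
probs_sort /= probs_sort.sum(dim=-1, keepdim=True)            # re-normalize the kept tokens
next_tok = torch.gather(probs_idx, -1, torch.multinomial(probs_sort, num_samples=1))
print(int(next_tok))  # vocab index of the sampled token
```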
- """ - if max_gen_len is None: - max_gen_len = self.model.params.max_seq_len - 1 - - prompt_tokens = [ - self.formatter.encode_dialog_prompt(dialog) for dialog in dialogs - ] - generation_tokens, generation_logprobs = self.generate( - prompt_tokens=prompt_tokens, - max_gen_len=max_gen_len, - temperature=temperature, - top_p=top_p, - logprobs=logprobs, - ) - if logprobs: - return [ - { - "generation": { - "role": "assistant", - "content": self.tokenizer.decode(t), - }, - "tokens": [self.tokenizer.decode([x]) for x in t], - "logprobs": logprobs_i, - } - for t, logprobs_i in zip(generation_tokens, generation_logprobs) - ] - return [ - { - "generation": { - "role": "assistant", - "content": self.tokenizer.decode(t), - }, - } - for t in generation_tokens - ] - - -def build_args_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser() - - parser.add_argument( - "-f", - "--pte", - type=str, - default=None, - help="path to exported executorch .pte file", - ) - - parser.add_argument( - "-p", "--params", type=str, default=None, help="model params file" - ) - - parser.add_argument( - "-t", - "--tokenizer", - type=str, - default=None, - ) - - parser.add_argument( - "--prompt", - type=str, - default="Hello", - ) - - parser.add_argument( - "--temperature", - type=float, - default=0.6, - ) - - parser.add_argument( - "-kv", - "--kv_cache", - default=False, - action="store_true", - ) - - parser.add_argument( - "--max_gen_len", - type=int, - default=10, - help="Maximum length of the generated response sequence.", - ) - - return parser - - -def main() -> None: - parser = build_args_parser() - args = parser.parse_args() - - with open(args.params, "r") as f: - params = json.loads(f.read()) - model_args: ModelArgs = ModelArgs( - max_seq_len=128, - max_batch_size=1, - use_kv_cache=args.kv_cache, - **params, - ) - runner = LlamaRunner( - model_path=args.pte, tokenizer_path=args.tokenizer, model_args=model_args - ) - result = runner.text_completion( - prompts=[args.prompt], - max_gen_len=args.max_gen_len, - temperature=args.temperature, - ) - print(f"Result: {result}") - - -if __name__ == "__main__": - main() # pragma: no cover + return { + "generation": self.tokenizer.decode(generation_tokens), + "tokens": generation_tokens, + } diff --git a/examples/models/llama2/runner/native.py b/examples/models/llama2/runner/native.py new file mode 100644 index 0000000000..cefafc1a88 --- /dev/null +++ b/examples/models/llama2/runner/native.py @@ -0,0 +1,115 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import json +from typing import Optional + +import torch + +from examples.models.llama2.llama_transformer import ModelArgs +from executorch.extension.pybindings.portable_lib import _load_for_executorch + +from .generation import LlamaRunner + + +class NativeLlamaRunner(LlamaRunner): + """ + Runs llama via ExecuTorch with provided pte file. 
+ """ + + def __init__(self, args): + with open(args.params, "r") as f: + params = json.loads(f.read()) + model_args: ModelArgs = ModelArgs( + max_seq_len=args.max_len, + max_batch_size=1, + use_kv_cache=args.kv_cache, + **params, + ) + super().__init__(tokenizer_path=args.tokenizer, model_args=model_args) + self.model = _load_for_executorch(args.pte) + + def forward( + self, + tokens: Optional[torch.LongTensor] = None, + input_pos: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + return ( + self.model.forward((tokens, input_pos)) + if input_pos is not None + else self.model.forward((tokens,)) + )[0] + + +def build_args_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + + parser.add_argument( + "-f", + "--pte", + type=str, + default=None, + help="path to exported executorch .pte file", + ) + + parser.add_argument( + "-p", "--params", type=str, default=None, help="model params file" + ) + + parser.add_argument( + "-t", + "--tokenizer", + type=str, + default=None, + ) + + parser.add_argument( + "--prompt", + type=str, + default="Hello", + ) + + parser.add_argument( + "--temperature", + type=float, + default=0.6, + ) + + parser.add_argument( + "-kv", + "--kv_cache", + default=True, + action="store_true", + ) + + parser.add_argument( + "--max_len", + type=int, + default=128, + help="Maximum length of the generated response sequence.", + ) + + return parser + + +def main() -> None: + parser = build_args_parser() + args = parser.parse_args() + runner = NativeLlamaRunner(args) + result = runner.text_completion( + prompt=args.prompt, + temperature=args.temperature, + ) + print( + "Response: \n{response}\n Tokens:\n {tokens}".format( + response=result["generation"], tokens=result["tokens"] + ) + ) + + +if __name__ == "__main__": + main() # pragma: no cover diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index cd6d9c9e7c..fee5a86b36 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -10,29 +10,31 @@ // The module takes in a string as input and emits a string as output. #include + +#include + +#include +#include + #if ET_USE_TIKTOKEN #include #else /* BPE */ #include #endif /* ET_USE_TIKTOKEN*/ -#include -#include -#include - -#include -#include -#include - -#ifdef USE_ATEN_LIB -#include -#endif - -#include -#include -#include -#include namespace torch::executor { +namespace { +static constexpr auto kAppendEosToPrompt = "append_eos_to_prompt"; +static constexpr auto kEnableDynamicShape = "enable_dynamic_shape"; +static constexpr auto kBosId = "get_bos_id"; +static constexpr auto kEosId = "get_eos_id"; +static constexpr auto kMaxSeqLen = "get_max_seq_len"; +static constexpr auto kNBos = "get_n_bos"; +static constexpr auto kNEos = "get_n_eos"; +static constexpr auto kVocabSize = "get_vocab_size"; +static constexpr auto kUseKVCache = "use_kv_cache"; +static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; +} // namespace Runner::Runner( const std::string& model_path, @@ -41,9 +43,25 @@ Runner::Runner( // NOTE: we observed ~2x loading performance increase on iPhone 15 // and a ~5% improvement on Galaxy S22 by switching to // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors. 
- : module_(std::make_unique(model_path, Module::LoadMode::File)), + : temperature_(temperature), + module_(std::make_unique(model_path, Module::LoadMode::File)), tokenizer_path_(tokenizer_path), - temperature_(temperature) { + tokenizer_( +#if ET_USE_TIKTOKEN + get_tiktoken_for_llama() +#else + std::make_unique() +#endif + ), + metadata_({ + {kAppendEosToPrompt, false}, + {kEnableDynamicShape, false}, + {kMaxSeqLen, 128}, + {kNBos, 1}, + {kNEos, 1}, + {kUseKVCache, true}, + {kUseSDPAWithKVCache, false}, + }) { ET_LOG( Info, "Creating LLaMa runner: model_path=%s, tokenizer_path=%s", @@ -52,7 +70,8 @@ Runner::Runner( } bool Runner::is_loaded() const { - return module_->is_loaded() && tokenizer_ && sampler_; + return module_->is_loaded() && tokenizer_ && text_decoder_runner_ && + text_prefiller_ && text_token_generator_; } Error Runner::load() { @@ -61,250 +80,52 @@ Error Runner::load() { } ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward")); - // Read out metadata: vocab_size (expected by the model), BOS, EOS, n_BOS, - // n_EOS max_seq_len from the model - ET_LOG(Info, "Reading metadata from model"); - const auto method_names = module_->method_names(); - ET_CHECK_MSG(method_names.ok(), "Failed to read method names from model"); - model_methods_ = method_names.get(); - n_bos_ = get_module_metadata(module_.get(), "get_n_bos", 1); - n_eos_ = get_module_metadata(module_.get(), "get_n_eos", 1); - max_seq_len_ = - get_module_metadata(module_.get(), "get_max_seq_len", 128); - use_kv_cache_ = get_module_metadata(module_.get(), "use_kv_cache", true); - use_sdpa_with_kv_cache_ = - get_module_metadata(module_.get(), "use_sdpa_with_kv_cache", false); - append_eos_ = - get_module_metadata(module_.get(), "append_eos_to_prompt", false); - enable_parallel_prefill_ = - get_module_metadata(module_.get(), "enable_dynamic_shape", false); - - // Load tokenizer -#if ET_USE_TIKTOKEN - tokenizer_ = get_tiktoken_for_llama(); -#else - tokenizer_ = std::make_unique(); -#endif tokenizer_->load(tokenizer_path_); - vocab_size_ = get_module_metadata( - module_.get(), "get_vocab_size", tokenizer_->vocab_size()); - bos_id_ = get_module_metadata( - module_.get(), "get_bos_id", tokenizer_->bos_tok()); - eos_id_ = get_module_metadata( - module_.get(), "get_eos_id", tokenizer_->eos_tok()); - - // Create sampler - sampler_ = std::make_unique( - vocab_size_, - temperature_, - ::executorch::llm::kTopp, - static_cast(std::time(nullptr))); - - return Error::Ok; -} - -int32_t Runner::logitsToToken(const exec_aten::Tensor& logits_tensor) { - ET_CHECK_MSG(logits_tensor.dim() == 3, "Logits tensor must be 3D"); - auto num_tokens = logits_tensor.size(1); + ET_LOG(Info, "Reading metadata from model"); - switch (logits_tensor.scalar_type()) { - case ScalarType::Float: { - float* logits = logits_tensor.mutable_data_ptr(); - float* logits_last = logits; - logits_last += (num_tokens - 1) * tokenizer_->vocab_size(); - return sampler_->sample(logits_last); - } - case ScalarType::Half: { - exec_aten::Half* logits = - logits_tensor.mutable_data_ptr(); - exec_aten::Half* logits_last = logits; - logits_last += (num_tokens - 1) * tokenizer_->vocab_size(); - return sampler_->sample(logits_last); + metadata_[kBosId] = tokenizer_->bos_tok(); + metadata_[kEosId] = tokenizer_->eos_tok(); + metadata_[kVocabSize] = tokenizer_->vocab_size(); + + const auto method_names = + ET_UNWRAP(module_->method_names(), "Failed reading method names"); + + for (auto& pair : metadata_) { + const auto& method_name = pair.first; + auto& value = pair.second; + + if 
(method_names.count(method_name)) { + value = ET_UNWRAP(module_->get(method_name)) + .toScalar() + .to(); + } else { + ET_LOG( + Info, + "Methond %s not found, using the default value %" PRId64, + method_name.c_str(), + value); } - default: - ET_CHECK_MSG( - false, - "Unsupported dtype output %hhd", - static_cast(logits_tensor.scalar_type())); } -} - -Result Runner::prefill( - const std::vector& tokens, - ManagedTensor& managed_tokens, - ManagedTensor& managed_start_pos, - std::function token_callback) { - // enable_parallel_prefill_ maybe set even when not using kv cache - // When kv cache is not used, start pos is ignored - int32_t num_tokens = tokens.size(); - if (enable_parallel_prefill_) { - managed_tokens.resize({1, num_tokens}); - int64_t* tokens_ptr = - managed_tokens.get_aliasing_tensor().mutable_data_ptr(); - for (int i = 0; i < num_tokens; i++) { - // The following assumes batch size = 1 - tokens_ptr[i] = tokens[i]; - } - std::vector inputs; - auto tokens_tensor = managed_tokens.get_aliasing_tensor(); - auto start_pos = managed_start_pos.get_aliasing_tensor(); - - // inputs:[tokens, start_pos] - inputs.push_back(tokens_tensor); - inputs.push_back(start_pos); - - Result> outputs_res = module_->forward(inputs); - ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); - ET_CHECK_MSG( - outputs_res.get()[0].isTensor(), - "Non Tensor Output returned from executing LLM"); - ET_CHECK_MSG( - outputs_res.get()[0].toTensor().size(1) == num_tokens, - "Expected number of output tokens %d does not match returned value %zu.", - num_tokens, - outputs_res.get()[0].toTensor().size(1)); - - start_pos.mutable_data_ptr()[0] = num_tokens; - - uint64_t prev = tokens[0]; - uint64_t cur; - for (int i = 1; i < num_tokens; i++) { - cur = tokens[i]; - auto piece_res = tokenizer_->decode(prev, cur); - ET_CHECK_OK_OR_RETURN_ERROR(piece_res.error()); - util::safe_printf(piece_res.get().c_str()); - fflush(stdout); - prev = cur; - if (token_callback) { - token_callback(piece_res.get().c_str()); - } - } - cur = logitsToToken(outputs_res.get()[0].toTensor()); - auto piece_res = tokenizer_->decode(prev, cur); - ET_CHECK(piece_res.ok()); - const char* piece = piece_res.get().c_str(); - util::safe_printf(piece); - fflush(stdout); - if (token_callback) { - token_callback(piece_res.get().c_str()); - } + text_decoder_runner_ = std::make_unique( + module_.get(), + metadata_.at(kUseKVCache), + metadata_.at(kVocabSize), + temperature_); + text_prefiller_ = std::make_unique( + tokenizer_.get(), + text_decoder_runner_.get(), + metadata_.at(kUseKVCache), + enable_parallel_prefill_); + + text_token_generator_ = std::make_unique( + tokenizer_.get(), + text_decoder_runner_.get(), + metadata_.at(kUseKVCache), + metadata_.at(kEosId), + &stats_); - // Return the logits tensor - stats_.first_token_ms = util::time_in_ms(); - stats_.prompt_eval_end_ms = util::time_in_ms(); - return outputs_res.get()[0].toTensor(); - } else { // sequential prefill - int64_t pos = 0; // position in the sequence - int64_t cur_token = tokens[0]; - int64_t prev_token; - // This is a hack to enable returning a logits tensor from prefill - auto logits_tensor = managed_tokens.get_aliasing_tensor(); - while (pos < num_tokens) { - // Run the model - Result logits_res = run_model_step( - cur_token, managed_tokens, managed_start_pos, num_tokens); - - ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error()); - logits_tensor = logits_res.get(); - // Hack to enable returning a logits tensor from prefill - - prev_token = cur_token; - - long sample_start_time_ms = 
util::time_in_ms(); - cur_token = logitsToToken(logits_tensor); - stats_.aggregate_sampling_time_ms += - util::time_in_ms() - sample_start_time_ms; - - // advance the state machine - if (pos < num_tokens - 1) { - // prefill, force the next token to be the next prompt token - cur_token = tokens[pos + 1]; - } - pos++; - - // print the token as string, decode it with the Tokenizer object - auto piece_res = tokenizer_->decode(prev_token, cur_token); - ET_CHECK(piece_res.ok()); - const char* piece = piece_res.get().c_str(); - util::safe_printf(piece); - fflush(stdout); - if (token_callback) { - token_callback(piece_res.get().c_str()); - } - } - auto start_pos = managed_start_pos.get_aliasing_tensor(); - start_pos.mutable_data_ptr()[0] = num_tokens; - stats_.first_token_ms = util::time_in_ms(); - stats_.prompt_eval_end_ms = util::time_in_ms(); - return logits_tensor; - } -} - -// Given an input token. Set up the inputs for the model and execute a single -// step. Returning the logits tensor. -Result Runner::run_model_step( - int64_t input_token, - ManagedTensor& managed_tokens, - ManagedTensor& managed_start_pos, - size_t max_seq_len) { - // ET_LOG(Info, "Input token %" PRIu64, input_token); - if (use_kv_cache_) { - auto tokens = managed_tokens.get_aliasing_tensor(); - auto start_pos = managed_start_pos.get_aliasing_tensor(); - - // When using kv-cache our input is always 1 token, so just update to the - // latest. - tokens.mutable_data_ptr()[0] = input_token; - - Result> outputs_res = - module_->forward({tokens, start_pos}); - ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); - ET_CHECK_MSG( - outputs_res.get().size() == 1, - "More then one output returned from executing LLM."); - ET_CHECK_MSG( - outputs_res.get()[0].isTensor(), - "Non Tensor Output returned from executing LLM"); - - // Bump start_pos by 1 - start_pos.mutable_data_ptr()[0]++; - - // Return the logits tensor - return outputs_res.get()[0].toTensor(); - } else { // no kv cache - std::vector inputs; - auto tokens = managed_tokens.get_aliasing_tensor(); - (void)managed_start_pos; // unused - - // When not using kv-cache our input is the entire history of tokens we have - // seen, so resize input to be 1 larger and append the new token to the end. - // TODO does this work in ATen mode? - tokens.mutable_data_ptr()[tokens.size(1) - 1] = input_token; - - // inputs:[tokens] - inputs.push_back(tokens); - - Result> outputs_res = module_->forward(inputs); - ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); - ET_CHECK_MSG( - outputs_res.get().size() == 1, - "More then one output returned from executing LLM."); - ET_CHECK_MSG( - outputs_res.get()[0].isTensor(), - "Non Tensor Output returned from executing LLM"); - - if (tokens.size(1) < max_seq_len) { - // Resize the tokens tensor to be 1 larger for next step. - // Note that this relies on the fact that underlying memory is the same - // such that previous tokens stored there will still exist. - // Not a good thing to rely upon. 
- managed_tokens.resize({1, static_cast(tokens.size(1) + 1)}); - } - - // Return the logits tensor - return outputs_res.get()[0].toTensor(); - } + return Error::Ok; } Error Runner::generate( @@ -321,6 +142,15 @@ Error Runner::generate( stats_.model_load_end_ms = util::time_in_ms(); } + // Wrap the token_callback with print function + std::function wrapped_callback = + [token_callback](const std::string& piece) { + util::safe_printf(piece.c_str()); + fflush(stdout); + if (token_callback) { + token_callback(piece); + } + }; // First token time only measures the time it takes to encode the prompt and // return a response token. @@ -328,10 +158,14 @@ Error Runner::generate( shouldStop_ = false; // Set the sequence length to the max seq length if not provided - seq_len = (seq_len > 0 && seq_len <= max_seq_len_) ? seq_len : max_seq_len_; + seq_len = (seq_len > 0 && seq_len <= metadata_.at(kMaxSeqLen)) + ? seq_len + : metadata_.at(kMaxSeqLen); - Result> encode_res = - tokenizer_->encode(prompt, n_bos_, append_eos_ ? n_eos_ : 0); + Result> encode_res = tokenizer_->encode( + prompt, + metadata_.at(kNBos), + metadata_.at(kAppendEosToPrompt) ? metadata_.at(kNEos) : 0); ET_CHECK_OK_OR_RETURN_ERROR( encode_res.error(), "Failed to encode prompt %s", prompt.c_str()); @@ -342,123 +176,44 @@ Error Runner::generate( ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token"); ET_CHECK_MSG( - num_prompt_tokens < max_seq_len_, - "Max seq length exceeded - please increase max seq len value in .../llama2/model.py"); - + num_prompt_tokens < metadata_.at(kMaxSeqLen), + "num_prompt_tokens %d >= max_seq_len_ %" PRId64 + ", Max seq length exceeded - please increase max seq len value in .../llama2/model.py", + num_prompt_tokens, + metadata_.at(kMaxSeqLen)); ET_CHECK_MSG( num_prompt_tokens < seq_len, - "Sequence length exceeded - please increase the seq_len value passed to generate()"); - - // start the main loop - int64_t pos = 0; // position in the sequence - - std::vector token_data; // allocate space for the tokens - std::vector token_shape = {1, seq_len}; - - std::vector start_pos_data; // allocate space for the tokens - std::vector start_pos_shape = {1}; - - token_data.resize(seq_len); - if (use_kv_cache_) { - // hard code these to size 1 as kv cache is locked to static size right now. - start_pos_data.resize(1); - start_pos_data.push_back(0); - } - - // initialize tensor wrappers - ManagedTensor tokens_managed( - token_data.data(), token_shape, ScalarType::Long); - // Create with the max shape to approapriately set the capacity of this - // tensor, then resize back to 1 for first input. - tokens_managed.resize({1, 1}); - - ManagedTensor start_pos_managed( - start_pos_data.data(), start_pos_shape, ScalarType::Long); - - int64_t prev_token; - int64_t cur_token = prompt_tokens[0]; + "num_prompt_tokens %d >= seq_len %d, Sequence length exceeded - please increase the seq_len value passed to generate()", + num_prompt_tokens, + seq_len); // Prefill first // Here feed all tokens to the model and get the next predicted token // after the prompt. After that we will enter generate loop. 
auto prefill_res = - prefill(prompt_tokens, tokens_managed, start_pos_managed, token_callback); + text_prefiller_->prefill(prompt_tokens, 0, wrapped_callback); + stats_.first_token_ms = util::time_in_ms(); + stats_.prompt_eval_end_ms = util::time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); - exec_aten::Tensor& prefill_res_tensor = prefill_res.get(); - cur_token = logitsToToken(prefill_res_tensor); - if (use_kv_cache_) { - // Prefill could be parallel or sequential. - // Parallel: - // kv cache: - // - tokens_managed should resized to 1 as inference expects one token at - // a time. - // no kv cache: - // - tokens_managed should be resized to prompt length + 1, as inference - // expects all tokens at once. - // Sequential prefill: - // kv cache: - // - tokens_managed should be resized to 1, as inference expects one - // token at a time. - // no kv cache: - // - tokens_managed should be resized to prompt length + 1, as inference - // expects all tokens at once. - tokens_managed.resize({1, 1}); - } else { - tokens_managed.resize({1, num_prompt_tokens + 1}); - } - pos = num_prompt_tokens; - - // Generate our tokens - while (pos < seq_len - 1) { - // Run the model - Result logits_res = - run_model_step(cur_token, tokens_managed, start_pos_managed, seq_len); - - ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error()); - exec_aten::Tensor& logits_tensor = logits_res.get(); - - prev_token = cur_token; - - long sample_start_time_ms = util::time_in_ms(); - cur_token = logitsToToken(logits_tensor); - stats_.aggregate_sampling_time_ms += - util::time_in_ms() - sample_start_time_ms; - - pos++; - - // print the token as string, decode it with the Tokenizer object - auto piece_res = tokenizer_->decode(prev_token, cur_token); - ET_CHECK(piece_res.ok()); - const char* piece = piece_res.get().c_str(); - - // same as printf("%s", piece), but skips "unsafe" bytes - util::safe_printf(piece); - fflush(stdout); + uint64_t cur_token = prefill_res.get(); - if (token_callback) { - token_callback(piece); - } + // print the first token from prefill. No prev_token so use cur_token for it. 
+ wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token))); - if (shouldStop_) { - break; - } + // start the main loop + prompt_tokens.push_back(cur_token); + int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( + prompt_tokens, num_prompt_tokens, seq_len, wrapped_callback)); - // data-dependent terminating condition: we have n_eos_ number of EOS - if (pos >= num_prompt_tokens && cur_token == eos_id_) { - printf("\n"); - ET_LOG(Info, "\nReached to the end of generation"); - break; - } - } stats_.inference_end_ms = util::time_in_ms(); printf("\n"); - if (pos == seq_len) { + if (num_prompt_tokens + num_generated_tokens == seq_len) { ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len); } stats_.num_prompt_tokens = num_prompt_tokens; - stats_.num_generated_tokens = pos - num_prompt_tokens; + stats_.num_generated_tokens = num_generated_tokens; ::executorch::llm::print_report(stats_); if (stats_callback) { stats_callback(stats_); @@ -468,6 +223,10 @@ Error Runner::generate( } void Runner::stop() { - shouldStop_ = true; + if (is_loaded()) { + text_token_generator_->stop(); + } else { + ET_LOG(Error, "Token generator is not loaded, cannot stop"); + } } } // namespace torch::executor diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h index 2c313fd6fe..12fb63c6f3 100644 --- a/examples/models/llama2/runner/runner.h +++ b/examples/models/llama2/runner/runner.h @@ -15,14 +15,14 @@ #include #include #include -#include #include #include -#include +#include +#include +#include #include #include -#include namespace torch::executor { using Stats = ::executorch::llm::Stats; @@ -44,37 +44,21 @@ class Runner { void stop(); private: - int32_t logitsToToken(const exec_aten::Tensor& logits_tensor); - Result prefill( - const std::vector& tokens, - ManagedTensor& managed_tokens, - ManagedTensor& managed_start_pos, - std::function token_callback); - Result run_model_step( - int64_t input_token, - ManagedTensor& tokens, - ManagedTensor& start_pos, - size_t max_seq_len); - // metadata - int32_t vocab_size_; - int32_t bos_id_; - int32_t eos_id_; - int32_t n_bos_; - int32_t n_eos_; - int32_t max_seq_len_; - bool use_kv_cache_; - bool use_sdpa_with_kv_cache_; - bool append_eos_; - std::unordered_set model_methods_; - std::string model_path_; + float temperature_; + bool enable_parallel_prefill_; + bool shouldStop_{false}; + + // model std::unique_ptr module_; std::string tokenizer_path_; - float temperature_; std::unique_ptr tokenizer_; - std::unique_ptr sampler_; - bool shouldStop_{false}; + std::unordered_map metadata_; + std::unique_ptr text_decoder_runner_; + std::unique_ptr text_prefiller_; + std::unique_ptr text_token_generator_; + + // stats Stats stats_; - bool enable_parallel_prefill_; }; } // namespace torch::executor diff --git a/examples/models/llama2/runner/targets.bzl b/examples/models/llama2/runner/targets.bzl index c0a892e14d..5d8269c744 100644 --- a/examples/models/llama2/runner/targets.bzl +++ b/examples/models/llama2/runner/targets.bzl @@ -22,7 +22,6 @@ def define_common_targets(): ], exported_headers = [ "runner.h", - "util.h", ], preprocessor_flags = [ "-DUSE_ATEN_LIB", @@ -34,7 +33,9 @@ def define_common_targets(): exported_deps = [ "//executorch/backends/xnnpack:xnnpack_backend", "//executorch/extension/llm/runner:stats", - "//executorch/extension/llm/sampler:sampler" + aten_suffix, + "//executorch/extension/llm/runner:text_decoder_runner" + aten_suffix, + "//executorch/extension/llm/runner:text_prefiller" + 
aten_suffix, + "//executorch/extension/llm/runner:text_token_generator" + aten_suffix, "//executorch/extension/evalue_util:print_evalue" + aten_suffix, "//executorch/extension/runner_util:managed_tensor" + aten_suffix, "//executorch/extension/module:module" + aten_suffix, diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index f0120d05bb..15c12ddb8e 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -24,11 +24,11 @@ from executorch.examples.models.llama2.source_transformation.sdpa import ( replace_sdpa_with_custom_op, ) +from executorch.examples.models.llava.model import LlavaModel from executorch.exir import EdgeCompileConfig from executorch.exir.program._program import _to_edge_transform_and_lower from executorch.extension.llm.export.builder import DType, LLMEdgeManager -from model import LlavaModel from torch.ao.quantization.quantizer.xnnpack_quantizer import ( get_symmetric_quantization_config, XNNPACKQuantizer, @@ -85,7 +85,7 @@ def forward(self, input_pos, embeddings): ["-X", "-qmode", "8da4w", "--group_size", "128", "--embedding-quantize", "4,32"] ) quant_transform = get_quant_weight_transform(args, dtype_override, False) - pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args) + _, quantizers, _ = get_quantizer_and_quant_params(args) source_transforms = [] if llava.use_sdpa_with_kv_cache_op: source_transforms.append(replace_sdpa_with_custom_op) @@ -121,21 +121,22 @@ def forward(self, images): llava_image_encode = LlavaImageEncoder(llava) # quantizer - linear_quantizer = XNNPACKQuantizer() - operator_config_dynamic = get_symmetric_quantization_config( - is_per_channel=True, is_dynamic=True - ) - linear_quantizer.set_global(operator_config_dynamic) + quantizer = XNNPACKQuantizer() + quantizer.set_global(get_symmetric_quantization_config()) - manager = LlavaEdgeManager( - model=llava_image_encode, - modelname="llava_image_encoder", - max_seq_len=llava.text_model_args.max_seq_len, # This may not be right - dtype=DType.fp32, - use_kv_cache=True, - example_inputs=(resized,), - dynamic_shapes=dynamic_shapes, - ).capture_pre_autograd_graph() + manager = ( + LlavaEdgeManager( + model=llava_image_encode, + modelname="llava_image_encoder", + max_seq_len=llava.text_model_args.max_seq_len, # This may not be right + dtype=DType.fp32, + use_kv_cache=True, + example_inputs=(resized,), + dynamic_shapes=dynamic_shapes, + ) + .capture_pre_autograd_graph() + .pt2e_quantize([quantizer]) + ) # lower to executorch with torch.no_grad(): @@ -148,15 +149,7 @@ def forward(self, images): def export_token_embedding(llava, prompt): - embed = torch.nn.Embedding( - llava.model_.config.vocab_size, - llava.model_.config.hidden_size, - llava.model_.config.pad_token_id, - ) - embed.load_state_dict( - llava.model_.get_model().embed_tokens.state_dict(), strict=True, assign=True - ) - embed = embed.to(torch.float32) + embed = llava.embed_tokens token_dim_1 = Dim("token_dim_1", min=2, max=3518) dynamic_shapes = [{1: token_dim_1}] with torch.no_grad(): @@ -166,29 +159,14 @@ def export_token_embedding(llava, prompt): return token_embedding_ep -def main(): - parser = ArgumentParser() - parser.add_argument( - "--use-sdpa-with-kv-cache", - default=True, - action=BooleanOptionalAction, - help="Use sdpa_with_kv_cache custom op in LLava text model.", - ) - parser.add_argument( - "--pte-name", - default="llava_combined_xnnpack.pte", - help="Name of the exported ExecuTorch program.", - ) - args = parser.parse_args() - 
logging.info( - f"Exporting Llava model to ExecuTorch with sdpa_with_kv_cache: {args.use_sdpa_with_kv_cache}" - ) - llava_model = LlavaModel(use_sdpa_with_kv_cache_op=args.use_sdpa_with_kv_cache) +def export_all(llava_model: LlavaModel): llava = llava_model.get_eager_model() - prompt_before_image, resized, prompt_after_image = ( - llava_model.get_inputs_for_prefill() - ) + ( + prompt_before_image, + resized, + prompt_after_image, + ) = llava_model.get_inputs_for_prefill() image_encoder_ep = export_image_encoder( llava, resized, llava_model._get_image_dynamic_shapes() @@ -211,9 +189,7 @@ def main(): "text_model": text_model_ep, }, partitioner={ - "image_encoder": [ - XnnpackPartitioner(config_precisions=ConfigPrecisionType.FP32) - ], + "image_encoder": [XnnpackPartitioner()], "text_model": [ XnnpackPartitioner( config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, @@ -225,6 +201,29 @@ def main(): ) executorch_program = lowered_and_edge.to_executorch() + return executorch_program + + +def main(): + parser = ArgumentParser() + parser.add_argument( + "--use-sdpa-with-kv-cache", + default=True, + action=BooleanOptionalAction, + help="Use sdpa_with_kv_cache custom op in LLava text model.", + ) + parser.add_argument( + "--pte-name", + default="llava_combined_xnnpack.pte", + help="Name of the exported ExecuTorch program.", + ) + args = parser.parse_args() + logging.info( + f"Exporting Llava model to ExecuTorch with sdpa_with_kv_cache: {args.use_sdpa_with_kv_cache}" + ) + llava_model = LlavaModel(use_sdpa_with_kv_cache_op=args.use_sdpa_with_kv_cache) + + executorch_program = export_all(llava_model) with open(args.pte_name, "wb") as f: executorch_program.write_to_file(f) diff --git a/examples/models/llava/install_requirements.sh b/examples/models/llava/install_requirements.sh index 68923c2dad..7a7ad3145e 100644 --- a/examples/models/llava/install_requirements.sh +++ b/examples/models/llava/install_requirements.sh @@ -6,39 +6,7 @@ # LICENSE file in the root directory of this source tree. set -x -OS=$(uname) -# install llava from the submodule. We can't do pip install llava because it is packaged incorrectly. -if [[ $OS != "Darwin" ]]; -then - #This doesn't work for macos, on python 3.12, because torch 2.1.2 is missing. - pip install --force-reinstall -e examples/third-party/LLaVA -else - # manually install dependencies - pip install tokenizers==0.15.1 sentencepiece==0.1.99 \ - shortuuid accelerate==0.21.0 peft \ - pydantic markdown2[all] scikit-learn==1.2.2 \ - requests httpx==0.24.0 uvicorn fastapi \ - einops==0.6.1 einops-exts==0.0.4 timm==0.6.13 - - pip install --force-reinstall -e examples/third-party/LLaVA --no-deps -fi - -# not included in the pip install package, but needed in llava -pip install protobuf - -# bitsandbytes depends on numpy 1.x, which is not compatible with numpy 2.x. -# Reinstall bitsandbytes to make it compatible. -pip install bitsandbytes -I - -# The deps of llava can have different versions than deps of ExecuTorch. -# For example, torch version required from llava is older than ExecuTorch. -# To make both work, recover ExecuTorch's original dependencies by rerunning -# the install_requirements.sh. Notice this won't install executorch. 
-bash -x ./install_requirements.sh --pybind xnnpack - -# Newer transformer (4.38) will give TypeError: LlavaLlamaForCausalLM.forward() got an unexpected keyword argument 'cache_position' -pip install timm==0.6.13 -pip install transformers==4.37.2 +pip install transformers pip list diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py index 35831192b4..9f6d8d32e8 100644 --- a/examples/models/llava/model.py +++ b/examples/models/llava/model.py @@ -10,7 +10,6 @@ import re -from dataclasses import dataclass from typing import Any, Dict, Optional import requests @@ -22,56 +21,39 @@ replace_sdpa_with_custom_op, ) from executorch.examples.models.model_base import EagerModelBase -from llava.constants import ( - DEFAULT_IM_END_TOKEN, - DEFAULT_IM_START_TOKEN, - DEFAULT_IMAGE_TOKEN, - IMAGE_PLACEHOLDER, - IMAGE_TOKEN_INDEX, -) - -from llava.conversation import conv_templates - -from llava.mm_utils import get_model_name_from_path, tokenizer_image_token - -from llava.model.builder import load_pretrained_model - -from llava.model.llava_arch import LlavaMetaForCausalLM - -from llava.model.multimodal_encoder.clip_encoder import CLIPVisionTower from PIL import Image from torch import nn from torch.export import Dim from torchvision.transforms.v2 import functional as F -from transformers import LlamaForCausalLM - - -@dataclass -class PreprocessConfig: - crop_size: dict - image_mean: list[float] - image_std: list[float] - rescale_factor: float +from transformers import ( + AutoProcessor, + CLIPImageProcessor, + LlamaForCausalLM, + LlavaForConditionalGeneration, +) class Llava(torch.nn.Module): def __init__( self, - llava_model: LlavaMetaForCausalLM, - image_processor: CLIPVisionTower, - config: PreprocessConfig, + llava_model: LlavaForConditionalGeneration, + image_processor: CLIPImageProcessor, use_sdpa_with_kv_cache_op: bool = True, ): super().__init__() self.use_sdpa_with_kv_cache_op = use_sdpa_with_kv_cache_op - self.config = config self.model_ = llava_model + self.image_processor = image_processor + self.vision_feature_layer = self.model_.config.vision_feature_layer + self.vision_feature_select_strategy = ( + self.model_.config.vision_feature_select_strategy + ) self.text_model_args = ModelArgs( use_kv_cache=True, - vocab_size=self.model_.config.vocab_size, - hidden_dim=self.model_.config.intermediate_size, + vocab_size=self.model_.config.text_config.vocab_size, + hidden_dim=self.model_.config.text_config.intermediate_size, max_batch_size=1, # doesn't work with default batch size 32 ffn_dim_multiplier=1, # TODO: a hack to make rotary embedding happy enable_dynamic_shape=True, # allow parallel prefill @@ -79,8 +61,8 @@ def __init__( use_hf_rope=True, ) self.embed_tokens = nn.Embedding( - self.model_.config.vocab_size, - self.model_.config.hidden_size, + self.model_.config.text_config.vocab_size, + self.model_.config.text_config.hidden_size, self.model_.config.pad_token_id, ) self.text_model = Transformer(self.text_model_args) @@ -94,16 +76,13 @@ def __init__( assign=True, ) self.embed_tokens.load_state_dict( - state_dict=self.get_model().embed_tokens.state_dict(), + state_dict=self.model_.language_model.model.embed_tokens.state_dict(), strict=True, assign=True, ) - self.image_processor = image_processor - self.vision_tower = self.get_model().vision_tower - self.mm_projector = self.get_model().mm_projector def _translate_state_dict_for_text_model(self) -> Dict[str, Any]: - state_dict = self.model_.state_dict() + state_dict = self.model_.language_model.state_dict() key_map = { # 
fmt: off r"model.layers.([0-9]+).self_attn.q_proj.": r"layers.\1.attention.wq.", @@ -138,13 +117,42 @@ def get_new_key(old_key: str) -> str: return new_state_dict + def _feature_select(self, image_outputs): + selected_image_feature = image_outputs.hidden_states[self.vision_feature_layer] + + if self.vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif self.vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + else: + raise ValueError( + f"Unexpected select feature: {self.vision_feature_select_strategy}" + ) + return selected_image_feature + def get_model(self): return self.model_.get_model() def encode_images(self, images: torch.Tensor) -> torch.Tensor: - images = images.to(dtype=self.get_model().dtype) - image_features = self.vision_tower(images) - image_features = self.mm_projector(image_features) + images = images.to(dtype=self.model_.dtype) + if type(images) is list: + image_features = [] + for image in images: + image_forward_out = self.model_.vision_tower( + image.to( + device=self.model_.device, dtype=self.model_.dtype + ).unsqueeze(0), + output_hidden_states=True, + ) + image_feature = self._feature_select(image_forward_out).to(image.dtype) + image_features.append(image_feature) + else: + image_forward_outs = self.model_.vision_tower( + images.to(device=self.model_.device, dtype=self.model_.dtype), + output_hidden_states=True, + ) + image_features = self._feature_select(image_forward_outs).to(images.dtype) + image_features = self.model_.multi_modal_projector(image_features) return image_features def image_preprocess(self, img: torch.Tensor) -> torch.Tensor: @@ -177,9 +185,11 @@ def image_preprocess(self, img: torch.Tensor) -> torch.Tensor: # print(resized.shape) # cropped = F.center_crop(img, output_size=[w, w]) # print(cropped.shape) - scaled = resized * self.config.rescale_factor + scaled = resized * self.image_processor.rescale_factor # print(scaled) - normed = F.normalize(scaled, self.config.image_mean, self.config.image_std) + normed = F.normalize( + scaled, self.image_processor.image_mean, self.image_processor.image_std + ) # print(normed) return normed.unsqueeze(0) @@ -225,7 +235,7 @@ def prefill_ref( """Avoiding the torch.where() call to find placeholder and insert image embedding. 
Taking 3 inputs instead.""" embeds = self.prefill_embedding(prompt_before_image, images, prompt_after_image) return LlamaForCausalLM.forward( - self.model_, + self.model_.language_model, inputs_embeds=embeds, return_dict=False, use_cache=False, @@ -239,82 +249,24 @@ def forward( return self.image_embedding(images) -def get_prompt(query: str, mm_use_im_start_end: bool, model_name: str) -> str: - qs = query - image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN - if IMAGE_PLACEHOLDER in qs: - if mm_use_im_start_end: - qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs) - else: - qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs) - else: - if mm_use_im_start_end: - qs = image_token_se + "\n" + qs - else: - qs = DEFAULT_IMAGE_TOKEN + "\n" + qs - - def get_conv_mode(model_name: str) -> str: - if "llama-2" in model_name.lower(): - conv_mode = "llava_llama_2" - elif "mistral" in model_name.lower(): - conv_mode = "mistral_instruct" - elif "v1.6-34b" in model_name.lower(): - conv_mode = "chatml_direct" - elif "v1" in model_name.lower(): - conv_mode = "llava_v1" - elif "mpt" in model_name.lower(): - conv_mode = "mpt" - else: - conv_mode = "llava_v0" - return conv_mode - - conv = conv_templates[get_conv_mode(model_name)].copy() - conv.append_message(conv.roles[0], qs) - conv.append_message(conv.roles[1], None) - prompt = conv.get_prompt() - return prompt - - class LlavaModel(EagerModelBase): def __init__(self, use_sdpa_with_kv_cache_op=True): self.use_sdpa_with_kv_cache_op = use_sdpa_with_kv_cache_op - self.model_path = "liuhaotian/llava-v1.5-7b" - self.tokenizer, self.model, self.image_processor, context_len = ( - load_pretrained_model( - model_path=self.model_path, - model_base=None, - model_name=get_model_name_from_path(self.model_path), - device_map="cpu", - device="cpu", - ) - ) - self.config = PreprocessConfig( - self.image_processor.crop_size, - self.image_processor.image_mean, - self.image_processor.image_std, - self.image_processor.rescale_factor, + self.processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + self.tokenizer = self.processor.tokenizer + self.image_processor = self.processor.image_processor + self.model = LlavaForConditionalGeneration.from_pretrained( + "llava-hf/llava-1.5-7b-hf", + device_map="cpu", ) self.image = Image.open( requests.get( "https://llava-vl.github.io/static/images/view.jpg", stream=True ).raw ) - self.args = type( - "Args", - (), - { - "model_path": self.model_path, - "model_base": None, - "model_name": get_model_name_from_path(self.model_path), - "query": "What are the things I should be cautious about when I visit here?", - "conv_mode": None, - "sep": ",", - "temperature": 0, - "top_p": None, - "num_beams": 1, - "max_new_tokens": 512, - }, - )() + self.prompt = """A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: +What are the things I should be cautious about when I visit here? 
ASSISTANT:""" + self.model_name = "llava-1.5-7b-hf" # set input to None and initialize them lazily self.input = None self.resized_image = None @@ -323,7 +275,6 @@ def get_eager_model(self): model = Llava( self.model, self.image_processor, - self.config, self.use_sdpa_with_kv_cache_op, ) model.to(dtype=torch.float32) @@ -346,16 +297,8 @@ def get_inputs_for_prefill(self): """Returns prompts as well as image.""" if self.input: return self.input - model_name = get_model_name_from_path(self.model_path) - self.prompt = get_prompt(self.args.query, False, model_name) - self.input_ids = ( - tokenizer_image_token( - self.prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" - ) - .unsqueeze(0) - .cpu() - ) - index = torch.where(self.input_ids == IMAGE_TOKEN_INDEX)[1] + self.input_ids = self.tokenizer.encode(self.prompt, return_tensors="pt").cpu() + index = torch.where(self.input_ids == self.model.config.image_token_index)[1] self.prompt_before_image = self.input_ids[:, :index] # print(prompt_before_image.shape) self.prompt_after_image = self.input_ids[:, index + 1 :] diff --git a/examples/models/llava/test/test_llava.py b/examples/models/llava/test/test_llava.py index ce0a527bc9..ef503a88fc 100644 --- a/examples/models/llava/test/test_llava.py +++ b/examples/models/llava/test/test_llava.py @@ -8,9 +8,20 @@ import unittest import torch +from executorch.examples.models.llava.export_llava import export_all from executorch.examples.models.llava.model import LlavaModel +# import order matters. We need to import portable_lib first since it contains the static op registry +# which will be used in the import of custom ops. Otherwise, the registration of custom ops will be skipped. +# I don't know how to mute UFMT so I'm just using if True: to avoid the error +if True: + from executorch.extension.pybindings.portable_lib import ( + _load_for_executorch_from_buffer, + ) +from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa: F401 + + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -38,14 +49,14 @@ def test_generated_output(self): with torch.inference_mode(): output_ids = self.llava_model.model.generate( self.llava_model.input_ids, - images=preprocessed, - image_sizes=[preprocessed.size], + pixel_values=preprocessed, do_sample=False, num_beams=1, max_new_tokens=5, use_cache=True, ) - + # the output includes prompt, removing it + output_ids = output_ids[:, -5:] ref_outputs = self.llava_model.tokenizer.batch_decode( output_ids, skip_special_tokens=True )[0].strip() @@ -66,3 +77,67 @@ def test_generated_output(self): torch.tensor([new_tokens]), skip_special_tokens=True )[0].strip() self.assertEqual(outputs, ref_outputs) + + def test_llava_export(self): + # export llava and make sure e2e works + llava_model = LlavaModel(use_sdpa_with_kv_cache_op=True) + + prompt_before_image, resized, prompt_after_image = ( + llava_model.get_inputs_for_prefill() + ) + executorch_program = export_all(llava_model) + llava_module = _load_for_executorch_from_buffer(executorch_program.buffer) + + start_pos = 0 + # pte prefill prompt before img + pte_embeds_before_img = llava_module.run_method( + "token_embedding", (prompt_before_image,) + )[0] + pte_prefill_before_img = llava_module.run_method( + "text_model", + (torch.tensor([start_pos], dtype=torch.int64), pte_embeds_before_img), + )[0] + + start_pos += pte_prefill_before_img.shape[1] + + # pte prefill image + pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0] + pte_prefill_img = 
llava_module.run_method( + "text_model", + ( + torch.tensor([start_pos], dtype=torch.int64), + pte_embeds_img, + ), + )[0] + + start_pos += pte_prefill_img.shape[1] + + # pte prefill prompt after img + pte_embeds_after_img = llava_module.run_method( + "token_embedding", (prompt_after_image,) + )[0] + pte_prefill_after_img = llava_module.run_method( + "text_model", + (torch.tensor([start_pos], dtype=torch.int64), pte_embeds_after_img), + )[0] + + # being tested, using llama_transformer + new_tokens = [torch.argmax(pte_prefill_after_img[..., -1, :]).item()] + # TODO: uncomment this line + # self.assertEquals(new_tokens[0], 1932) # When + for i in range(4): + print(i, llava_model.tokenizer.decode(new_tokens[i])) + token_embeds = llava_module.run_method( + "token_embedding", (torch.tensor([[new_tokens[i]]], dtype=torch.int64),) + )[0] + logits = llava_module.run_method( + "text_model", + (torch.tensor([start_pos + i], dtype=torch.int64), token_embeds), + )[0] + new_tokens.append(torch.argmax(logits[..., -1, :]).item()) + + outputs = llava_model.tokenizer.batch_decode( + torch.tensor([new_tokens]), skip_special_tokens=True + )[0].strip() + print(outputs) + self.assertEqual(len(new_tokens), 5) diff --git a/examples/models/phi3-mini-lora/README.md b/examples/models/phi3-mini-lora/README.md index 97fef41776..d7f4b96c66 100644 --- a/examples/models/phi3-mini-lora/README.md +++ b/examples/models/phi3-mini-lora/README.md @@ -5,9 +5,8 @@ In this exmaple, we export a model ([phi-3-mini](https://github.com/pytorch/exec ### Step 1: [Optional] Install ExecuTorch dependencies `./install_requirements.sh` in ExecuTorch root directory. -### Step 2: Install TorchTune nightly -The LoRA model used is recent and is not yet officially released on `TorchTune`. To be able to run this example, you will need to run the following to install TorchTune nighly: -- `./examples/models/llava_encoder/install_requirements.sh`' +### Step 2: Install Requirements +- `./examples/models/phi3-mini-lora/install_requirements.sh` ### Step 3: Export and run the model 1. Export the model to ExecuTorch. diff --git a/examples/models/phi3-mini-lora/install_requirements.sh b/examples/models/phi3-mini-lora/install_requirements.sh old mode 100644 new mode 100755 index 20921d5d5d..ab73d8dac4 --- a/examples/models/phi3-mini-lora/install_requirements.sh +++ b/examples/models/phi3-mini-lora/install_requirements.sh @@ -5,6 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Install nightly build of TorchTune. -pip install --pre torchtune --extra-index-url https://download.pytorch.org/whl/nightly/cpu --no-cache-dir +pip install torchvision +pip install torchtune pip install tiktoken diff --git a/examples/qualcomm/CMakeLists.txt b/examples/qualcomm/CMakeLists.txt index 6bfbdea058..8e4c3a9b07 100644 --- a/examples/qualcomm/CMakeLists.txt +++ b/examples/qualcomm/CMakeLists.txt @@ -49,9 +49,7 @@ include(${EXECUTORCH_SRCS_FILE}) get_filename_component( EXECUTORCH_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}/../.." 
ABSOLUTE ) -set(_qnn_executor_runner__srcs ${_executor_runner__srcs}) -set(_qnn_llama_runner__srcs ${_llama_runner__srcs}) -set(_qnn_qaihub_llama_runner__srcs ${_llama_runner__srcs}) + # portable_ops_lib gen_selected_ops(LIB_NAME "full_portable_ops_lib" INCLUDE_ALL_OPS "ON") @@ -69,71 +67,17 @@ target_include_directories( full_portable_ops_lib PUBLIC ${_common_include_directories} ) -# preprocess executor runner src files -list(TRANSFORM _qnn_executor_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") -list(FILTER _qnn_executor_runner__srcs EXCLUDE REGEX ".*executor_runner.cpp$") -list(PREPEND _qnn_executor_runner__srcs - ${CMAKE_CURRENT_LIST_DIR}/executor_runner/qnn_executor_runner.cpp -) -# preprocess llama runner src files -list(TRANSFORM _qnn_llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") -list(FILTER _qnn_llama_runner__srcs EXCLUDE REGEX ".*runner.cpp$") -list(PREPEND _qnn_llama_runner__srcs - ${CMAKE_CURRENT_LIST_DIR}/executor_runner/qnn_llama_runner.cpp - ${CMAKE_CURRENT_LIST_DIR}/llama2/runner/runner.cpp - ${CMAKE_CURRENT_LIST_DIR}/llama2/runner/runner.h -) -# preprocess qaihub llama runner src files -list(TRANSFORM _qnn_qaihub_llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") -list(FILTER _qnn_qaihub_llama_runner__srcs EXCLUDE REGEX ".*runner.cpp*$") -list(PREPEND _qnn_qaihub_llama_runner__srcs - ${CMAKE_CURRENT_LIST_DIR}/executor_runner/qnn_qaihub_llama_runner.cpp - ${CMAKE_CURRENT_LIST_DIR}/llama2/qaihub_runner/runner.cpp - ${CMAKE_CURRENT_LIST_DIR}/llama2/qaihub_runner/runner.h - ${CMAKE_CURRENT_LIST_DIR}/llama2/qaihub_runner/io_memory.cpp - ${CMAKE_CURRENT_LIST_DIR}/llama2/qaihub_runner/io_memory.h +# build qnn_executor_runner +add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/executor_runner ) -# build executor runner -add_executable(qnn_executor_runner ${_qnn_executor_runner__srcs}) -target_include_directories( - qnn_executor_runner PUBLIC ${_common_include_directories} -) -target_link_libraries( - qnn_executor_runner qnn_executorch_backend full_portable_ops_lib etdump - ${FLATCCRT_LIB} gflags +# build qnn_llama_runner +add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama2 ) -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - target_link_options(qnn_executor_runner PUBLIC -fsanitize=undefined) -endif() -# build llama runner -add_executable(qnn_llama_runner ${_qnn_llama_runner__srcs}) -target_include_directories( - qnn_llama_runner PUBLIC ${_common_include_directories} -) -target_link_libraries(qnn_llama_runner - qnn_executorch_backend - full_portable_ops_lib - extension_data_loader - extension_module - gflags -) -target_compile_options(qnn_llama_runner - PUBLIC ${_common_compile_options} -) -# build qaihub llama runner -add_executable(qnn_qaihub_llama_runner ${_qnn_qaihub_llama_runner__srcs}) -target_include_directories(qnn_qaihub_llama_runner - PUBLIC ${_common_include_directories} -) -target_link_libraries(qnn_qaihub_llama_runner - qnn_executorch_backend - executorch_no_prim_ops - extension_data_loader - extension_module - gflags -) -target_compile_options(qnn_qaihub_llama_runner - PUBLIC ${_common_compile_options} +# build qaihub_llama2_7b_runner +add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/qaihub_scripts/llama2 ) diff --git a/examples/qualcomm/README.md b/examples/qualcomm/README.md index d41ad80ecf..3e7a018ac7 100644 --- a/examples/qualcomm/README.md +++ b/examples/qualcomm/README.md @@ -2,6 +2,20 @@ This directory contains examples for some AI models. 
+We have separated the example scripts into the following subfolders, please refer to [README.md](../../backends/qualcomm/README.md) for the example scripts' directory structure: + +1. executor_runner: This folder contains a general executor runner capable of running most of the models. As a rule of thumb, if a model does not have its own customized runner, execute the model using [executor_runner](./executor_runner/qnn_executor_runner.cpp). On the other hand, if a model has its own runner, such as [llama2](./oss_scripts/llama2/qnn_llama_runner.cpp), use the customized runner to execute the model. A customized runner should be located in the same folder as the model's Python script. + +2. oss_scripts: OSS stands for Open Source Software. This folder contains Python scripts for open source models. Some models under this folder might also have their own customized runner. + For example, [llama2](./oss_scripts/llama2/qnn_llama_runner.cpp) contains not only the Python scripts to prepare the model but also a customized runner for executing the model. + +3. qaihub_scripts: QAIHub stands for [Qualcomm AI Hub](https://aihub.qualcomm.com/). On QAIHub, users can find pre-compiled context binaries, a format used by QNN to save its models. This provides users with a new option for model deployment. Unlike oss_scripts & scripts, where the example scripts convert a model from nn.Module to ExecuTorch .pte files, qaihub_scripts provides example scripts for converting pre-compiled context binaries to ExecuTorch .pte files. Additionally, users can find customized example runners specific to the QAIHub models for execution. For example, [qaihub_llama2_7b](./qaihub_scripts/llama2/qaihub_llama2_7b.py) is a script converting context binaries to ExecuTorch .pte files, and [qaihub_llama2_7b_runner](./qaihub_scripts/llama2/qaihub_llama2_7b_runner.cpp) is a customized example runner to execute llama2 .pte files. Please be aware that context binaries downloaded from QAIHub are tied to a specific QNN SDK version. +Before executing the scripts and runner, please ensure that you are using a QNN SDK version that matches the context binary. The tutorial below also covers how to check the QNN version of a context binary. + +4. scripts: This folder contains scripts to build models provided by ExecuTorch. + + + Please check helper of each examples for detailed arguments. Here are some general information and limitations. @@ -10,13 +24,13 @@ Here are some general information and limitations. Please finish tutorial [Setting up executorch](https://pytorch.org/executorch/stable/getting-started-setup). -Please finish [setup QNN backend](../../backends/qualcomm/setup.md). +Please finish [setup QNN backend](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md). ## Environment Please set up `QNN_SDK_ROOT` environment variable. Note that this version should be exactly same as building QNN backend. -Please check [setup](../../backends/qualcomm/setup.md). +Please check [setup](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md). Please set up `LD_LIBRARY_PATH` to `$QNN_SDK_ROOT/lib/x86_64-linux-clang`. Or, you could put QNN libraries to default search path of the dynamic linker. 
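Because the QNN SDK used at runtime must match the one used to build the backend, a small pre-flight check can catch a mismatched environment early. The snippet below is only an illustrative sketch, not part of the example scripts; it assumes the standard `QNN_SDK_ROOT` layout and `LD_LIBRARY_PATH` convention described above:

```python
# Illustrative environment check; not part of the example scripts.
import os
import sys
from pathlib import Path

sdk_root = os.environ.get("QNN_SDK_ROOT")
if not sdk_root:
    sys.exit("QNN_SDK_ROOT is not set; finish the QNN backend setup first.")

lib_dir = Path(sdk_root) / "lib" / "x86_64-linux-clang"
if not lib_dir.is_dir():
    sys.exit(f"{lib_dir} does not exist; check your QNN SDK installation.")

if str(lib_dir) not in os.environ.get("LD_LIBRARY_PATH", "").split(":"):
    print(f"Warning: {lib_dir} is not on LD_LIBRARY_PATH; "
          "QNN libraries may not be found at runtime.")
```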
@@ -39,12 +53,23 @@ cd $EXECUTORCH_ROOT/examples/qualcomm/scripts #### For MobileNet_v2 ```bash -python mobilenet_v2.py -s -m "SM8550" -b path/to/build_android/ -d /path/to/imagenet-mini/val +python mobilenet_v2.py -s -m "SM8550" -b path/to/cmake-out-android/ -d /path/to/imagenet-mini/val ``` #### For DeepLab_v3 ```bash -python deeplab_v3.py -s -m "SM8550" -b path/to/build_android/ --download +python deeplab_v3.py -s -m "SM8550" -b path/to/cmake-out-android/ --download +``` + +#### Check context binary version +```bash +cd ${QNN_SDK_ROOT}/bin/x86_64-linux-clang +./qnn-context-binary-utility --context_binary ${PATH_TO_CONTEXT_BINARY} --json_file ${OUTPUT_JSON_NAME} +``` +After retreiving the json file, search in the json file for the field "buildId" and ensure it matches the ${QNN_SDK_ROOT} you are using for the environment variable. +If you run into the following error, that means the ${QNN_SDK_ROOT} that you are using is older than the context binary QNN SDK version. In this case, please download a newer QNN SDK version. +``` +Error: Failed to get context binary info. ``` ## Additional Dependency diff --git a/examples/qualcomm/executor_runner/CMakeLists.txt b/examples/qualcomm/executor_runner/CMakeLists.txt new file mode 100644 index 0000000000..73106d9368 --- /dev/null +++ b/examples/qualcomm/executor_runner/CMakeLists.txt @@ -0,0 +1,22 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set(_qnn_executor_runner__srcs ${_executor_runner__srcs}) + +# preprocess executor runner src files +list(TRANSFORM _qnn_executor_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") +list(FILTER _qnn_executor_runner__srcs EXCLUDE REGEX ".*executor_runner.cpp$") +list(PREPEND _qnn_executor_runner__srcs ${CMAKE_CURRENT_LIST_DIR}/qnn_executor_runner.cpp) + +# build executor runner +add_executable(qnn_executor_runner ${_qnn_executor_runner__srcs}) +target_include_directories( + qnn_executor_runner PUBLIC ${_common_include_directories} +) +target_link_libraries( + qnn_executor_runner qnn_executorch_backend full_portable_ops_lib etdump + ${FLATCCRT_LIB} gflags +) diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index 7871cafc24..7cd3709b95 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -404,7 +404,15 @@ int main(int argc, char** argv) { elapsed_time, elapsed_time / inference_index); } else { - // if no input is provided, run with default input as executor_runner. + // if no input is provided, fill the inputs with default values + auto inputs = util::prepare_input_tensors(*method); + ET_CHECK_MSG( + inputs.ok(), + "Could not prepare inputs: 0x%" PRIx32, + (uint32_t)inputs.error()); + ET_LOG( + Info, + "Input list not provided. Inputs prepared with default values set."); Error status = method->execute(); ET_CHECK_MSG( status == Error::Ok, diff --git a/examples/qualcomm/llama2/README.md b/examples/qualcomm/llama2/README.md deleted file mode 100644 index 4670f74251..0000000000 --- a/examples/qualcomm/llama2/README.md +++ /dev/null @@ -1,67 +0,0 @@ -# Summary - -## Overview -This file provides you the instructions to run LLAMA2 with different parameters via Qualcomm HTP backend. Following settings support for -1. Stories 110M -2. 
Llama-2-7b-chat-hf - -Please check corresponding section for more information. - -## Stories 110M -This example demonstrates how to run a smaller LLAMA2, stories110M on mobile via Qualcomm HTP backend. Model architecture is fine-tuned specifically for HTP to accelerate the performance. Weight is quantized via PTQ quantization to fit the model on a phone. - -### Instructions -#### Step 1: Setup -1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. -2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. - -#### Step2: Prepare Model -Download and preapre stories110M model - -```bash -# tokenizer.model & stories110M.pt: -wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" -wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" - -# tokenizer.bin: -python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin - -# params.json: -echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json -``` - -#### Step3: Run default examples -Default example generates the story based on the given prompt, "Once". -```bash -# 16a4w quant: -python examples/qualcomm/llama2/llama.py -a ${ARTIFACTS} -b build_android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --prompt "Once" -``` - -#### (Note) Customized PTQ data set -User prompts are used for PTQ calibration data. Take the examples above, the word "Once" is the only word for PTQ. If you want to observe more data during the calibration time. Please add more prompts to the args `--prompt`. - - -## Llama-2-7b-chat-hf -This example demonstrates how to run Llama-2-7b-chat-hf on mobile via Qualcomm HTP backend. Model was precompiled into context binaries by [Qualcomm AI HUB](https://aihub.qualcomm.com/). -Note that the pre-compiled context binaries could not be futher fine-tuned for other downstream tasks. - -### Instructions -#### Step 1: Setup -1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. -2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. - -#### Step2: Prepare Model -1. Create account for https://aihub.qualcomm.com/ -2. Follow instructions in https://huggingface.co/qualcomm/Llama-v2-7B-Chat to export context binaries (will take some time to finish) - -```bash -# tokenizer.model: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/tokenizer.model -# tokenizer.bin: -python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin -``` - -#### Step3: Run default examples -```bash -# AIHUB_CONTEXT_BINARIES: ${PATH_TO_AIHUB_WORKSPACE}/build/llama_v2_7b_chat_quantized -python examples/qualcomm/llama2/llama_qaihub.py -a ${ARTIFACTS} -b build_android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_bin tokenizer.bin --prompt "What is Python?" 
-``` diff --git a/examples/qualcomm/oss_scripts/dino_v2.py b/examples/qualcomm/oss_scripts/dino_v2.py index 03249b63d8..b3fecfbbe6 100644 --- a/examples/qualcomm/oss_scripts/dino_v2.py +++ b/examples/qualcomm/oss_scripts/dino_v2.py @@ -13,7 +13,7 @@ import torch from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( build_executorch_binary, make_output_dir, parse_skip_delegation_node, diff --git a/examples/qualcomm/oss_scripts/esrgan.py b/examples/qualcomm/oss_scripts/esrgan.py index e4e609e152..56871db764 100644 --- a/examples/qualcomm/oss_scripts/esrgan.py +++ b/examples/qualcomm/oss_scripts/esrgan.py @@ -15,7 +15,7 @@ from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.examples.qualcomm.scripts.edsr import get_dataset -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( build_executorch_binary, make_output_dir, parse_skip_delegation_node, diff --git a/examples/qualcomm/oss_scripts/fbnet.py b/examples/qualcomm/oss_scripts/fbnet.py index fe07ab83d2..495b08b413 100755 --- a/examples/qualcomm/oss_scripts/fbnet.py +++ b/examples/qualcomm/oss_scripts/fbnet.py @@ -14,7 +14,7 @@ import timm from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.examples.qualcomm.scripts.inception_v4 import get_dataset -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( build_executorch_binary, make_output_dir, setup_common_args_and_variables, diff --git a/examples/qualcomm/oss_scripts/gMLP_image_classification.py b/examples/qualcomm/oss_scripts/gMLP_image_classification.py index e9b9b91507..864a9b919f 100644 --- a/examples/qualcomm/oss_scripts/gMLP_image_classification.py +++ b/examples/qualcomm/oss_scripts/gMLP_image_classification.py @@ -15,7 +15,7 @@ import torch from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( build_executorch_binary, make_output_dir, parse_skip_delegation_node, diff --git a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt new file mode 100644 index 0000000000..2f13f017d3 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set(_qnn_llama_runner__srcs ${_llama_runner__srcs}) + +# preprocess qnn llama runner src files +list(TRANSFORM _qnn_llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") +list(FILTER _qnn_llama_runner__srcs EXCLUDE REGEX ".*(/runner/).*") +list(PREPEND _qnn_llama_runner__srcs + ${CMAKE_CURRENT_LIST_DIR}/qnn_llama_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/runner.h +) + + +# build qnn llama runner +add_executable(qnn_llama_runner ${_qnn_llama_runner__srcs}) +target_include_directories( + qnn_llama_runner PUBLIC ${_common_include_directories} +) +target_link_libraries(qnn_llama_runner + qnn_executorch_backend + full_portable_ops_lib + extension_data_loader + extension_module + gflags +) +target_compile_options(qnn_llama_runner + PUBLIC ${_common_compile_options} +) diff --git a/examples/qualcomm/oss_scripts/llama2/README.md b/examples/qualcomm/oss_scripts/llama2/README.md new file mode 100644 index 0000000000..ec15545a6f --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama2/README.md @@ -0,0 +1,39 @@ +# Summary + +## Overview +This file provides instructions for running LLAMA2 with different parameters via the Qualcomm HTP backend. The following setting is supported: Stories 110M + +Please check the corresponding section for more information. + +## Stories 110M +This example demonstrates how to run a smaller LLAMA2 model, stories110M, on mobile via the Qualcomm HTP backend. The model architecture is fine-tuned specifically for HTP to accelerate performance. Weights are quantized via PTQ to fit the model on a phone. + +### Instructions +#### Step 1: Setup +1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. +2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. + +#### Step 2: Prepare Model +Download and prepare the stories110M model + +```bash +# tokenizer.model & stories110M.pt: +wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" +wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" + +# tokenizer.bin: +python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin + +# params.json: +echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json +``` + +#### Step 3: Run default examples +The default example generates a story based on the given prompt, "Once". +```bash +# 16a4w quant: +python examples/qualcomm/oss_scripts/llama2/llama.py -a ${ARTIFACTS} -b cmake-out-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --prompt "Once" +``` + +#### (Note) Customized PTQ data set +User prompts are used as PTQ calibration data. In the example above, the word "Once" is the only calibration input. If you want the model to observe more data during calibration, add more prompts to the `--prompt` argument.
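As a side note on Step 2 above, the fields in params.json mirror the static llama configuration imported by llama.py. A rough sketch of that mapping, assuming ModelArgs accepts these fields as keyword arguments (illustrative only, not the script's exact construction):

```python
import json

from executorch.examples.qualcomm.oss_scripts.llama2.model.static_llama import (
    ModelArgs,
)

# Load the params.json written in Step 2 and map it onto the model config.
with open("params.json") as f:
    params = json.load(f)

# dim=768, multiple_of=32, n_heads=12, n_layers=12, norm_eps=1e-05, vocab_size=32000
config = ModelArgs(**params)
print(config)
```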
\ No newline at end of file diff --git a/examples/qualcomm/llama2/llama.py b/examples/qualcomm/oss_scripts/llama2/llama.py similarity index 98% rename from examples/qualcomm/llama2/llama.py rename to examples/qualcomm/oss_scripts/llama2/llama.py index 6e0f3f4399..087296b15b 100644 --- a/examples/qualcomm/llama2/llama.py +++ b/examples/qualcomm/oss_scripts/llama2/llama.py @@ -28,8 +28,11 @@ generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, ) -from executorch.examples.qualcomm.llama2.model.static_llama import LlamaModel, ModelArgs -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.oss_scripts.llama2.model.static_llama import ( + LlamaModel, + ModelArgs, +) +from executorch.examples.qualcomm.utils import ( make_output_dir, setup_common_args_and_variables, SimpleADB, @@ -453,7 +456,7 @@ def inference(args, pre_gen_pte=""): host_id=args.host, soc_model=args.model, shared_buffer=args.shared_buffer, - runner="examples/qualcomm/qnn_llama_runner", + runner="examples/qualcomm/oss_scripts/llama2/qnn_llama_runner", ) # No pregen inputs, input_list is not required adb.push(inputs=[], input_list="", files=[args.tokenizer_bin]) diff --git a/examples/qualcomm/llama2/model/static_llama.py b/examples/qualcomm/oss_scripts/llama2/model/static_llama.py similarity index 100% rename from examples/qualcomm/llama2/model/static_llama.py rename to examples/qualcomm/oss_scripts/llama2/model/static_llama.py diff --git a/examples/qualcomm/executor_runner/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp similarity index 97% rename from examples/qualcomm/executor_runner/qnn_llama_runner.cpp rename to examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp index 0d654e6836..17a22bb30a 100644 --- a/examples/qualcomm/executor_runner/qnn_llama_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp @@ -16,7 +16,7 @@ */ #include -#include +#include #include #include diff --git a/examples/qualcomm/llama2/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp similarity index 99% rename from examples/qualcomm/llama2/runner/runner.cpp rename to examples/qualcomm/oss_scripts/llama2/runner/runner.cpp index 009bb6b209..d452336175 100644 --- a/examples/qualcomm/llama2/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp @@ -9,20 +9,19 @@ // A simple llama2 runner that includes preprocessing and post processing logic. // The module takes in a string as input and emits a string as output. 
-#include +#include #include +#include #include #include +#include +#include +#include #include #include #include -#include -#include -#include -#include - namespace torch { namespace executor { diff --git a/examples/qualcomm/llama2/runner/runner.h b/examples/qualcomm/oss_scripts/llama2/runner/runner.h similarity index 100% rename from examples/qualcomm/llama2/runner/runner.h rename to examples/qualcomm/oss_scripts/llama2/runner/runner.h diff --git a/examples/qualcomm/oss_scripts/squeezenet.py b/examples/qualcomm/oss_scripts/squeezenet.py index bc000c6938..820f23d119 100644 --- a/examples/qualcomm/oss_scripts/squeezenet.py +++ b/examples/qualcomm/oss_scripts/squeezenet.py @@ -13,7 +13,7 @@ import torch from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( build_executorch_binary, make_output_dir, parse_skip_delegation_node, diff --git a/examples/qualcomm/oss_scripts/ssd300_vgg16.py b/examples/qualcomm/oss_scripts/ssd300_vgg16.py index 8fdb896e09..bd5089441e 100644 --- a/examples/qualcomm/oss_scripts/ssd300_vgg16.py +++ b/examples/qualcomm/oss_scripts/ssd300_vgg16.py @@ -14,7 +14,7 @@ import torch from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( build_executorch_binary, make_output_dir, parse_skip_delegation_node, diff --git a/examples/qualcomm/qaihub_scripts/llama2/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/llama2/CMakeLists.txt new file mode 100644 index 0000000000..a6efc56ba9 --- /dev/null +++ b/examples/qualcomm/qaihub_scripts/llama2/CMakeLists.txt @@ -0,0 +1,35 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# preprocess qaihub llama2 7b runner src files + +set(_qaihub_llama2_7b_runner__srcs ${_llama_runner__srcs}) + +list(TRANSFORM _qaihub_llama2_7b_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") +list(FILTER _qaihub_llama2_7b_runner__srcs EXCLUDE REGEX ".*(/runner/).*") +list(PREPEND _qaihub_llama2_7b_runner__srcs + ${CMAKE_CURRENT_LIST_DIR}/qaihub_llama2_7b_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/runner.h + ${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.h +) + +# build qaihub llama2 7b runner +add_executable(qaihub_llama2_7b_runner ${_qaihub_llama2_7b_runner__srcs}) +target_include_directories(qaihub_llama2_7b_runner + PUBLIC ${_common_include_directories} +) +target_link_libraries(qaihub_llama2_7b_runner + qnn_executorch_backend + executorch_no_prim_ops + extension_data_loader + extension_module + gflags +) +target_compile_options(qaihub_llama2_7b_runner + PUBLIC ${_common_compile_options} +) diff --git a/examples/qualcomm/qaihub_scripts/llama2/README.md b/examples/qualcomm/qaihub_scripts/llama2/README.md new file mode 100644 index 0000000000..790a2fe4e3 --- /dev/null +++ b/examples/qualcomm/qaihub_scripts/llama2/README.md @@ -0,0 +1,31 @@ +# Summary + +## Overview +This file provides you the instructions to run LLAMA2 with different parameters via Qualcomm HTP backend. Following settings support for Llama-2-7b-chat-hf + +Please check corresponding section for more information. 
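Because the qaihub flow below depends on the context-binary/QNN SDK match called out in the top-level README's "Check context binary version" section, here is a small sketch for locating the `buildId` field in the JSON emitted by `qnn-context-binary-utility`. It searches recursively instead of assuming a particular nesting, since the exact layout of that JSON is not documented here:

```python
import json
import sys


def find_key(obj, key):
    """Recursively yield every value stored under `key` in nested JSON data."""
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k == key:
                yield v
            yield from find_key(v, key)
    elif isinstance(obj, list):
        for item in obj:
            yield from find_key(item, key)


# Usage: python find_build_id.py ${OUTPUT_JSON_NAME}
with open(sys.argv[1]) as f:
    data = json.load(f)

print(list(find_key(data, "buildId")))
```

Compare the printed value against the QNN SDK version installed at `${QNN_SDK_ROOT}` before running the qaihub scripts.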
+ +## Llama-2-7b-chat-hf +This example demonstrates how to run Llama-2-7b-chat-hf on mobile via the Qualcomm HTP backend. The model was precompiled into context binaries by [Qualcomm AI HUB](https://aihub.qualcomm.com/). +Note that the pre-compiled context binaries cannot be further fine-tuned for other downstream tasks. + +### Instructions +#### Step 1: Setup +1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. +2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. + +#### Step 2: Prepare Model +1. Create an account at https://aihub.qualcomm.com/ +2. Follow the instructions at https://huggingface.co/qualcomm/Llama-v2-7B-Chat to export the context binaries (this will take some time to finish) + +```bash +# tokenizer.model: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/tokenizer.model +# tokenizer.bin: +python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin +``` + +#### Step 3: Run default examples +```bash +# AIHUB_CONTEXT_BINARIES: ${PATH_TO_AIHUB_WORKSPACE}/build/llama_v2_7b_chat_quantized +python examples/qualcomm/qaihub_scripts/llama2/qaihub_llama2_7b.py -a ${ARTIFACTS} -b cmake-out-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_bin tokenizer.bin --prompt "What is Python?" +``` \ No newline at end of file diff --git a/examples/qualcomm/llama2/llama_qaihub.py b/examples/qualcomm/qaihub_scripts/llama2/qaihub_llama2_7b.py similarity index 85% rename from examples/qualcomm/llama2/llama_qaihub.py rename to examples/qualcomm/qaihub_scripts/llama2/qaihub_llama2_7b.py index b5dd6ab458..b3f7e20dd2 100644 --- a/examples/qualcomm/llama2/llama_qaihub.py +++ b/examples/qualcomm/qaihub_scripts/llama2/qaihub_llama2_7b.py @@ -5,10 +5,11 @@ # LICENSE file in the root directory of this source tree. import gc +import json import os +from multiprocessing.connection import Client import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor - import torch from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( # noqa: F401 QcomChipset, @@ -20,7 +21,7 @@ generate_qnn_executorch_compiler_spec, generate_qnn_executorch_option, ) -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( setup_common_args_and_variables, SimpleADB, ) @@ -28,74 +29,17 @@ from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass -def main(): - parser = setup_common_args_and_variables() - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example.
Default ./llama2_qai_hub", - default="./llama2_qai_hub", - type=str, - ) - - parser.add_argument( - "--context_binaries", - help="path to context binaries generated from qai_hub", - required=True, - ) +def main(args): + os.makedirs(args.artifact, exist_ok=True) - parser.add_argument( - "--use_prompt_processor", - help="tokens will be evaluated all at once", - default=False, - action="store_true", - ) - - parser.add_argument( - "--tokenizer_bin", - help="llama2 tokenizer binary", - required=True, - type=str, - ) - - parser.add_argument( - "--seq_len", - help="ouput sequence length for llama2", - default=128, - type=int, - ) - - parser.add_argument( - "--temperature", - help="sampling temperature for llama2", - default=0.8, - type=float, - ) - - parser.add_argument( - "--prompt", - help="user prompts for llama2", - required=True, - type=str, - ) - - parser.add_argument( - "--pre_gen_pte", - help="folder path to pre-compiled ptes", - default=None, - type=str, - ) - - args = parser.parse_args() target_names = ( [ - f"llama_v2_7b_chat_quantized_Llama2_PromptProcessor_{i}_Quantized.bin" + f"llama_v2_7b_chat_quantized_PromptProcessor_{i}_Quantized.bin" for i in range(1, 5) ] if args.use_prompt_processor else [ - f"llama_v2_7b_chat_quantized_Llama2_TokenGenerator_{i}_Quantized.bin" + f"llama_v2_7b_chat_quantized_TokenGenerator_{i}_Quantized.bin" for i in range(1, 5) ] ) @@ -127,6 +71,7 @@ def main(): # export pte files pte_name, pte_files = "qaihub_llama7b", [] for i in range(len(target_names)): + print(f"pte {i} generating...") memory_planning_pass = MemoryPlanningPass( memory_planning_algo="greedy", alloc_graph_input=False, @@ -135,7 +80,10 @@ def main(): pte_files.append(f"{args.artifact}/{pte_name}_{i}.pte") with open(pte_files[-1], "wb") as file: file.write( - lowered_modules[0].buffer(memory_planning=memory_planning_pass) + lowered_modules[0].buffer( + extract_delegate_segments=True, + memory_planning=memory_planning_pass, + ) ) # gc for reducing host memory consuming bundle_programs.pop(0) @@ -175,11 +123,12 @@ def get_logit_encoding(path_to_last_shard: str): device_id=args.device, host_id=args.host, soc_model=args.model, - runner="examples/qualcomm/qnn_qaihub_llama_runner", + runner="examples/qualcomm/qaihub_scripts/llama2/qaihub_llama2_7b_runner", ) output_file = "result.txt" pos_embs_file = ["freq_cos", "freq_sin"] scale, offset = get_logit_encoding(target_names[-1]) + outputs = [] runner_args = [ *[ f"--sharded_{i+1}_path {os.path.basename(pte_file)}" @@ -200,7 +149,7 @@ def get_logit_encoding(path_to_last_shard: str): f"cd {adb.workspace} &&", "export ADSP_LIBRARY_PATH=. &&", "export LD_LIBRARY_PATH=. 
&&", - f"./qnn_qaihub_llama_runner {' '.join(runner_args)}", + f"./qaihub_llama2_7b_runner {' '.join(runner_args)}", ] ) @@ -217,7 +166,7 @@ def compute_pos_embedding(): def post_process(): with open(f"{args.artifact}/outputs/{output_file}", "r") as f: - print(f.read()) + outputs.append(f.read()) custom_files = [args.tokenizer_bin] for var_name, freq in zip(pos_embs_file, compute_pos_embedding()): @@ -229,7 +178,85 @@ def post_process(): adb.push(files=custom_files) adb.execute(custom_runner_cmd=runner_cmds) adb.pull(args.artifact, callback=post_process) + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send( + json.dumps( + { + "result": outputs[0], + } + ) + ) + else: + print(outputs[0]) if __name__ == "__main__": - main() + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./llama2_qai_hub", + default="./llama2_qai_hub", + type=str, + ) + + parser.add_argument( + "--context_binaries", + help="path to context binaries generated from qai_hub", + required=True, + ) + + parser.add_argument( + "--use_prompt_processor", + help="tokens will be evaluated all at once", + default=False, + action="store_true", + ) + + parser.add_argument( + "--tokenizer_bin", + help="llama2 tokenizer binary", + required=True, + type=str, + ) + + parser.add_argument( + "--seq_len", + help="ouput sequence length for llama2", + default=128, + type=int, + ) + + parser.add_argument( + "--temperature", + help="sampling temperature for llama2", + default=0.8, + type=float, + ) + + parser.add_argument( + "--prompt", + help="user prompts for llama2", + required=True, + type=str, + ) + + parser.add_argument( + "--pre_gen_pte", + help="folder path to pre-compiled ptes", + default=None, + type=str, + ) + + args = parser.parse_args() + + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/executor_runner/qnn_qaihub_llama_runner.cpp b/examples/qualcomm/qaihub_scripts/llama2/qaihub_llama2_7b_runner.cpp similarity index 97% rename from examples/qualcomm/executor_runner/qnn_qaihub_llama_runner.cpp rename to examples/qualcomm/qaihub_scripts/llama2/qaihub_llama2_7b_runner.cpp index 60e0b66a9b..7601b6027e 100644 --- a/examples/qualcomm/executor_runner/qnn_qaihub_llama_runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama2/qaihub_llama2_7b_runner.cpp @@ -17,7 +17,7 @@ */ #include -#include +#include #include #include diff --git a/examples/qualcomm/llama2/qaihub_runner/io_memory.cpp b/examples/qualcomm/qaihub_scripts/llama2/runner/io_memory.cpp similarity index 98% rename from examples/qualcomm/llama2/qaihub_runner/io_memory.cpp rename to examples/qualcomm/qaihub_scripts/llama2/runner/io_memory.cpp index c7e3df9996..f808b20c35 100644 --- a/examples/qualcomm/llama2/qaihub_runner/io_memory.cpp +++ b/examples/qualcomm/qaihub_scripts/llama2/runner/io_memory.cpp @@ -8,7 +8,7 @@ #include -#include +#include #include namespace torch { @@ -126,7 +126,8 @@ void BertMemory::prepare_io( hidden_state->dim_order().data())); // reuse inputs for following tensors for (int shard_index = 1; shard_index < 4; ++shard_index) { - // inpus of shard1,2,3: hidden_state, atten_mask, pos_ids_cos, pos_ids_sin + // inputs of shard1,2,3: hidden_state, atten_mask, pos_ids_cos, + // pos_ids_sin 
input_tensors_[shard_index].push_back(hidden_state_.get()); input_tensors_[shard_index].push_back(attention_mask_.get()); input_tensors_[shard_index].push_back(position_ids_cos_.get()); @@ -269,7 +270,7 @@ void KVCachedMemory::prepare_io( const_cast(pos_ids_sin->dim_order().data())); input_tensors_[0].push_back(position_ids_sin_.get()); // [IO]: hidden_state => [I] shard2,3,4 - int output_index = 8 * 2 * 32; // layres*(k + v caches)*heads + int output_index = 8 * 2 * 32; // layers*(k + v caches)*heads Result hidden_state = methods_meta[0]->output_tensor_meta(output_index); hidden_state_ = std::make_unique( diff --git a/examples/qualcomm/llama2/qaihub_runner/io_memory.h b/examples/qualcomm/qaihub_scripts/llama2/runner/io_memory.h similarity index 100% rename from examples/qualcomm/llama2/qaihub_runner/io_memory.h rename to examples/qualcomm/qaihub_scripts/llama2/runner/io_memory.h diff --git a/examples/qualcomm/llama2/qaihub_runner/runner.cpp b/examples/qualcomm/qaihub_scripts/llama2/runner/runner.cpp similarity index 98% rename from examples/qualcomm/llama2/qaihub_runner/runner.cpp rename to examples/qualcomm/qaihub_scripts/llama2/runner/runner.cpp index 2f8a01f4e9..1162daf322 100644 --- a/examples/qualcomm/llama2/qaihub_runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama2/runner/runner.cpp @@ -9,20 +9,19 @@ // A simple llama2 runner that includes preprocessing and post processing logic. // The module takes in a string as input and emits a string as output. -#include +#include #include +#include #include #include +#include +#include +#include #include #include #include -#include -#include -#include -#include - #if defined(__aarch64__) #include "arm_neon.h" #endif diff --git a/examples/qualcomm/llama2/qaihub_runner/runner.h b/examples/qualcomm/qaihub_scripts/llama2/runner/runner.h similarity index 97% rename from examples/qualcomm/llama2/qaihub_runner/runner.h rename to examples/qualcomm/qaihub_scripts/llama2/runner/runner.h index 355616fa8f..44522b5152 100644 --- a/examples/qualcomm/llama2/qaihub_runner/runner.h +++ b/examples/qualcomm/qaihub_scripts/llama2/runner/runner.h @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include diff --git a/examples/qualcomm/scripts/deeplab_v3.py b/examples/qualcomm/scripts/deeplab_v3.py index d870380e35..8d1aa376e7 100755 --- a/examples/qualcomm/scripts/deeplab_v3.py +++ b/examples/qualcomm/scripts/deeplab_v3.py @@ -15,7 +15,7 @@ from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.examples.models.deeplab_v3 import DeepLabV3ResNet101Model -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( build_executorch_binary, make_output_dir, parse_skip_delegation_node, diff --git a/examples/qualcomm/scripts/edsr.py b/examples/qualcomm/scripts/edsr.py index f602ecc1af..c5e3f8b010 100755 --- a/examples/qualcomm/scripts/edsr.py +++ b/examples/qualcomm/scripts/edsr.py @@ -15,7 +15,7 @@ import torch from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.examples.models.edsr import EdsrModel -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( build_executorch_binary, make_output_dir, parse_skip_delegation_node, diff --git a/examples/qualcomm/scripts/inception_v3.py b/examples/qualcomm/scripts/inception_v3.py index 90eb8cf206..50361938e8 100755 --- a/examples/qualcomm/scripts/inception_v3.py +++ b/examples/qualcomm/scripts/inception_v3.py @@ -14,7 +14,7 @@ 
import torch from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.examples.models.inception_v3.model import InceptionV3Model -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( build_executorch_binary, make_output_dir, parse_skip_delegation_node, diff --git a/examples/qualcomm/scripts/inception_v4.py b/examples/qualcomm/scripts/inception_v4.py index 84b20e6e20..cd4dcb7cd9 100755 --- a/examples/qualcomm/scripts/inception_v4.py +++ b/examples/qualcomm/scripts/inception_v4.py @@ -14,7 +14,7 @@ import torch from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.examples.models.inception_v4 import InceptionV4Model -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( build_executorch_binary, make_output_dir, parse_skip_delegation_node, diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index 8972ca202f..94f528dbc3 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -13,7 +13,7 @@ import torch from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( build_executorch_binary, make_output_dir, parse_skip_delegation_node, diff --git a/examples/qualcomm/scripts/mobilenet_v2.py b/examples/qualcomm/scripts/mobilenet_v2.py index 3ebdcd5d05..8a3032df02 100755 --- a/examples/qualcomm/scripts/mobilenet_v2.py +++ b/examples/qualcomm/scripts/mobilenet_v2.py @@ -14,7 +14,7 @@ import torch from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.examples.models.mobilenet_v2 import MV2Model -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( build_executorch_binary, make_output_dir, parse_skip_delegation_node, diff --git a/examples/qualcomm/scripts/mobilenet_v3.py b/examples/qualcomm/scripts/mobilenet_v3.py index 18fd7c849a..d0cd7bb4df 100644 --- a/examples/qualcomm/scripts/mobilenet_v3.py +++ b/examples/qualcomm/scripts/mobilenet_v3.py @@ -13,7 +13,7 @@ import torch from executorch.examples.models.mobilenet_v3 import MV3Model -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( build_executorch_binary, make_output_dir, parse_skip_delegation_node, diff --git a/examples/qualcomm/scripts/torchvision_vit.py b/examples/qualcomm/scripts/torchvision_vit.py index cfdbe5d075..85852ebb2f 100755 --- a/examples/qualcomm/scripts/torchvision_vit.py +++ b/examples/qualcomm/scripts/torchvision_vit.py @@ -13,7 +13,7 @@ import torch from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.examples.models.torchvision_vit.model import TorchVisionViTModel -from executorch.examples.qualcomm.scripts.utils import ( +from executorch.examples.qualcomm.utils import ( build_executorch_binary, make_output_dir, setup_common_args_and_variables, diff --git a/examples/qualcomm/scripts/utils.py b/examples/qualcomm/utils.py similarity index 98% rename from examples/qualcomm/scripts/utils.py rename to examples/qualcomm/utils.py index 8211dc4581..2293b31e59 100755 --- a/examples/qualcomm/scripts/utils.py +++ b/examples/qualcomm/utils.py @@ -50,7 +50,7 @@ def __init__( host_id=None, error_only=False, shared_buffer=False, - 
runner="examples/qualcomm/qnn_executor_runner", + runner="examples/qualcomm/executor_runner/qnn_executor_runner", ): self.qnn_sdk = qnn_sdk self.build_path = build_path @@ -105,13 +105,14 @@ def push(self, inputs=None, input_list=None, files=None): f"{self.build_path}/{self.runner}", f"{self.build_path}/backends/qualcomm/libqnn_executorch_backend.so", ] - input_list_file, input_files = generate_inputs( self.working_dir, self.input_list_filename, inputs, input_list ) - # prepare input list - artifacts.append(input_list_file) + if input_list_file is not None: + # prepare input list + artifacts.append(input_list_file) + for artifact in artifacts: self._adb(["push", artifact, self.workspace]) @@ -336,7 +337,7 @@ def setup_common_args_and_variables(): parser.add_argument( "-b", "--build_folder", - help="path to cmake binary directory for android, e.g., /path/to/build_android", + help="path to cmake binary directory for android, e.g., /path/to/cmake-out-android", type=str, required=True, ) @@ -434,7 +435,7 @@ def parse_skip_delegation_node(args): def generate_inputs(dest_path: str, file_name: str, inputs=None, input_list=None): - input_list_file = "" + input_list_file = None input_files = [] # Prepare input list diff --git a/examples/third-party/LLaVA b/examples/third-party/LLaVA deleted file mode 160000 index 7440ec9ee3..0000000000 --- a/examples/third-party/LLaVA +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7440ec9ee37b0374c6b5548818e89878e38f3353 diff --git a/exir/passes/replace_aten_with_edge_pass.py b/exir/passes/replace_aten_with_edge_pass.py index 28732a55fb..bf949fc995 100644 --- a/exir/passes/replace_aten_with_edge_pass.py +++ b/exir/passes/replace_aten_with_edge_pass.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import torch from executorch.exir.dialects._ops import ops from executorch.exir.dialects.edge._ops import EdgeOpOverload diff --git a/extension/aten_util/make_aten_functor_from_et_functor.h b/extension/aten_util/make_aten_functor_from_et_functor.h index cacb63ad3d..aef008055b 100644 --- a/extension/aten_util/make_aten_functor_from_et_functor.h +++ b/extension/aten_util/make_aten_functor_from_et_functor.h @@ -20,8 +20,8 @@ #endif #include #include -#include #include +#include #include namespace torch { @@ -104,25 +104,39 @@ struct type_convert< typename remove_const_ref::type, torch::executor::Tensor>>> final { - public: - ATensor val; - std::unique_ptr managed_tensor; - torch::executor::Tensor converted; - std::vector sizes; - explicit type_convert(ATensor value) - : val(value), converted(torch::executor::Tensor(nullptr)) { - for (auto size : val.sizes()) { - sizes.push_back(size); - } - torch::executor::ScalarType scalar_type = - static_cast(val.scalar_type()); - managed_tensor = std::make_unique( - val.mutable_data_ptr(), sizes, scalar_type); - converted = managed_tensor->get_aliasing_tensor(); + explicit type_convert(ATensor value) : value_(value) { + auto sizes = std::make_shared>( + value_.sizes().begin(), value_.sizes().end()); + const ssize_t dim = sizes->size(); + auto dim_order = std::make_shared>(dim); + auto strides = std::make_shared>(dim); + + std::iota(dim_order->begin(), dim_order->end(), 0); + dim_order_to_stride_nocheck( + sizes->data(), dim_order->data(), dim, strides->data()); + + auto tensor_impl = std::make_shared( + static_cast(value_.scalar_type()), + sizes->size(), + sizes->data(), + value_.mutable_data_ptr(), + dim_order->data(), + strides->data()); + + converted_ = std::unique_ptr>( + new Tensor(tensor_impl.get()), + [sizes, dim_order, strides, tensor_impl](Tensor* pointer) { + delete pointer; + }); } + ETensor call() { - return converted; + return *converted_; } + + private: + ATensor value_; + std::unique_ptr> converted_; }; // Tensors: ETen to ATen. @@ -136,21 +150,22 @@ struct type_convert< typename remove_const_ref::type, torch::executor::Tensor>>> final { - public: - ETensor val; - at::Tensor converted; - std::vector sizes; - explicit type_convert(ETensor value) : val(value) { - for (auto size : val.sizes()) { - sizes.push_back(size); - } - c10::ScalarType scalar_type = - static_cast(val.scalar_type()); - converted = at::from_blob(val.mutable_data_ptr(), sizes, scalar_type); + explicit type_convert(ETensor value) + : value_(value), sizes_(value_.sizes().begin(), value_.sizes().end()) { + converted_ = at::from_blob( + value_.mutable_data_ptr(), + sizes_, + static_cast(value_.scalar_type())); } + ATensor call() { - return converted; + return converted_; } + + private: + ETensor value_; + at::Tensor converted_; + std::vector sizes_; }; // Optionals: ATen to ETen. 
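One property of the ETen-to-ATen converter above is worth spelling out: `at::from_blob` wraps existing memory without copying or taking ownership, so the converted `at::Tensor` is only valid while the source ExecuTorch tensor's storage stays alive. A standalone illustration of that behavior (plain ATen, independent of the ExecuTorch wrapper):

```cpp
#include <ATen/ATen.h>

#include <vector>

int main() {
  std::vector<float> storage(6, 1.0f);

  // from_blob aliases `storage`: no copy, no ownership transfer.
  at::Tensor view = at::from_blob(storage.data(), {2, 3}, at::kFloat);

  // Mutating the tensor mutates the original buffer, and the tensor must not
  // outlive `storage`.
  view.add_(1.0f);
  return storage[0] == 2.0f ? 0 : 1;
}
```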
diff --git a/extension/aten_util/targets.bzl b/extension/aten_util/targets.bzl index 6e32583029..b396cb7832 100644 --- a/extension/aten_util/targets.bzl +++ b/extension/aten_util/targets.bzl @@ -27,7 +27,6 @@ def define_common_targets(): ], exported_deps = [ "//executorch/extension/kernel_util:kernel_util", - "//executorch/extension/runner_util:managed_tensor", "//executorch/runtime/core:core", "//executorch/runtime/core:evalue", "//executorch/runtime/core/exec_aten:lib", diff --git a/extension/data_loader/buffer_data_loader.h b/extension/data_loader/buffer_data_loader.h index f92f888ddf..a5b66e24e7 100644 --- a/extension/data_loader/buffer_data_loader.h +++ b/extension/data_loader/buffer_data_loader.h @@ -12,6 +12,7 @@ #include #include #include +#include namespace executorch { namespace extension { @@ -48,6 +49,24 @@ class BufferDataLoader : public executorch::runtime::DataLoader { return size_; } + __ET_NODISCARD executorch::runtime::Error load_into( + size_t offset, + size_t size, + __ET_UNUSED const SegmentInfo& segment_info, + void* buffer) override { + ET_CHECK_OR_RETURN_ERROR( + buffer != nullptr, + InvalidArgument, + "Destination buffer cannot be null"); + + auto result = load(offset, size, segment_info); + if (!result.ok()) { + return result.error(); + } + std::memcpy(buffer, result->data(), size); + return executorch::runtime::Error::Ok; + } + private: const uint8_t* const data_; // uint8 is easier to index into. const size_t size_; diff --git a/extension/data_loader/file_data_loader.cpp b/extension/data_loader/file_data_loader.cpp index e13b6c0468..578c94bdba 100644 --- a/extension/data_loader/file_data_loader.cpp +++ b/extension/data_loader/file_data_loader.cpp @@ -52,7 +52,6 @@ static uint8_t* align_pointer(void* ptr, size_t alignment) { addr = (addr | (alignment - 1)) + 1; return reinterpret_cast(addr); } - } // namespace FileDataLoader::~FileDataLoader() { @@ -146,19 +145,6 @@ Result FileDataLoader::load( return FreeableBuffer(nullptr, 0, /*free_fn=*/nullptr); } - // Seek to the right place in the file. - off_t seek_offset = ::lseek(fd_, offset, SEEK_SET); - if (seek_offset != offset) { - ET_LOG( - Error, - "Seeking %s to offset %zu returned %zd: %s", - file_name_, - offset, - (ssize_t)seek_offset, - strerror(errno)); - return Error::AccessFailed; - } - // Allocate memory for the FreeableBuffer. size_t alloc_size = size; if (alignment_ > alignof(std::max_align_t)) { @@ -190,9 +176,75 @@ Result FileDataLoader::load( buffer, alloc_size); + auto err = load_into(offset, size, segment_info, aligned_buffer); + if (err != Error::Ok) { + // Free `buffer`, which is what malloc() gave us, not `aligned_buffer`. + std::free(buffer); + return err; + } + + // We can't naively free this pointer, since it may not be what malloc() gave + // us. Pass the offset to the real buffer as context. This is the number of + // bytes that need to be subtracted from the FreeableBuffer::data() pointer to + // find the actual pointer to free. + return FreeableBuffer( + aligned_buffer, + size, + FreeSegment, + /*free_fn_context=*/ + reinterpret_cast( + // Using signed types here because it will produce a signed ptrdiff_t + // value, though for us it will always be non-negative. + reinterpret_cast(aligned_buffer) - + reinterpret_cast(buffer))); +} + +Result FileDataLoader::size() const { + ET_CHECK_OR_RETURN_ERROR( + // Probably had its value moved to another instance. 
+ fd_ >= 0, + InvalidState, + "Uninitialized"); + return file_size_; +} + +__ET_NODISCARD Error FileDataLoader::load_into( + size_t offset, + size_t size, + __ET_UNUSED const SegmentInfo& segment_info, + void* buffer) { + ET_CHECK_OR_RETURN_ERROR( + // Probably had its value moved to another instance. + fd_ >= 0, + InvalidState, + "Uninitialized"); + ET_CHECK_OR_RETURN_ERROR( + offset + size <= file_size_, + InvalidArgument, + "File %s: offset %zu + size %zu > file_size_ %zu", + file_name_, + offset, + size, + file_size_); + ET_CHECK_OR_RETURN_ERROR( + buffer != nullptr, InvalidArgument, "Provided buffer cannot be null"); + + // Seek to the right place in the file. + off_t seek_offset = ::lseek(fd_, offset, SEEK_SET); + if (seek_offset != offset) { + ET_LOG( + Error, + "Seeking %s to offset %zu returned %zd: %s", + file_name_, + offset, + (ssize_t)seek_offset, + strerror(errno)); + return Error::AccessFailed; + } + // Read the data into the aligned address. size_t needed = size; - uint8_t* buf = reinterpret_cast(aligned_buffer); + uint8_t* buf = reinterpret_cast(buffer); while (needed > 0) { // Reads on macos will fail with EINVAL if size > INT32_MAX. ssize_t nread = ::read( @@ -214,37 +266,12 @@ Result FileDataLoader::load( size, offset, nread == 0 ? "EOF" : strerror(errno)); - // Free `buffer`, which is what malloc() gave us, not `aligned_buffer`. - std::free(buffer); return Error::AccessFailed; } needed -= nread; buf += nread; } - - // We can't naively free this pointer, since it may not be what malloc() gave - // us. Pass the offset to the real buffer as context. This is the number of - // bytes that need to be subtracted from the FreeableBuffer::data() pointer to - // find the actual pointer to free. - return FreeableBuffer( - aligned_buffer, - size, - FreeSegment, - /*free_fn_context=*/ - reinterpret_cast( - // Using signed types here because it will produce a signed ptrdiff_t - // value, though for us it will always be non-negative. - reinterpret_cast(aligned_buffer) - - reinterpret_cast(buffer))); -} - -Result FileDataLoader::size() const { - ET_CHECK_OR_RETURN_ERROR( - // Probably had its value moved to another instance. - fd_ >= 0, - InvalidState, - "Uninitialized"); - return file_size_; + return Error::Ok; } } // namespace extension diff --git a/extension/data_loader/file_data_loader.h b/extension/data_loader/file_data_loader.h index df73647d12..93a54ac891 100644 --- a/extension/data_loader/file_data_loader.h +++ b/extension/data_loader/file_data_loader.h @@ -72,6 +72,12 @@ class FileDataLoader : public executorch::runtime::DataLoader { __ET_NODISCARD executorch::runtime::Result size() const override; + __ET_NODISCARD executorch::runtime::Error load_into( + size_t offset, + size_t size, + __ET_UNUSED const SegmentInfo& segment_info, + void* buffer) override; + private: FileDataLoader( int fd, diff --git a/extension/data_loader/test/buffer_data_loader_test.cpp b/extension/data_loader/test/buffer_data_loader_test.cpp index 91398a62a1..83d053ee46 100644 --- a/extension/data_loader/test/buffer_data_loader_test.cpp +++ b/extension/data_loader/test/buffer_data_loader_test.cpp @@ -133,3 +133,56 @@ TEST_F(BufferDataLoaderTest, OutOfBoundsLoadFails) { EXPECT_NE(fb.error(), Error::Ok); } } + +TEST_F(BufferDataLoaderTest, LoadIntoNullDstFails) { + // Wrap some data in a loader. + uint8_t data[256] = {}; + BufferDataLoader edl(data, sizeof(data)); + + // Loading beyond the end of the data should fail. 
+ { + Result fb = edl.load_into( + /*offset=*/0, + /*size=*/1, + /*segment_info=*/ + DataLoader::SegmentInfo(DataLoader::SegmentInfo::Type::Program), + nullptr); + EXPECT_NE(fb.error(), Error::Ok); + } + + // Loading zero bytes still fails if dst is null. + { + Result fb = edl.load_into( + /*offset=*/0, + /*size=*/0, + /*segment_info=*/ + DataLoader::SegmentInfo(DataLoader::SegmentInfo::Type::Program), + nullptr); + EXPECT_NE(fb.error(), Error::Ok); + } +} + +TEST_F(BufferDataLoaderTest, InBoundsLoadIntoSucceeds) { + // Wrap some data in a loader. + uint8_t data[256] = {}; + data[0] = 1; + uint8_t buffer[256] = {}; + buffer[0] = 0; + BufferDataLoader edl(data, sizeof(data)); + + { + // Buffer contains 0 before load_into. + EXPECT_EQ(buffer[0], 0); + Error fb = edl.load_into( + /*offset=*/0, + /*size=*/1, + /*segment_info=*/ + DataLoader::SegmentInfo(DataLoader::SegmentInfo::Type::Program), + buffer); + EXPECT_EQ(fb, Error::Ok); + // Buffer contains 1 after load_into. + EXPECT_EQ(buffer[0], 1); + // Data is unaltered. + EXPECT_EQ(data[0], 1); + } +} diff --git a/extension/llm/custom_ops/op_tile_crop.cpp b/extension/llm/custom_ops/op_tile_crop.cpp index 094bb9beec..7c596665d7 100644 --- a/extension/llm/custom_ops/op_tile_crop.cpp +++ b/extension/llm/custom_ops/op_tile_crop.cpp @@ -13,13 +13,95 @@ namespace torch { namespace executor { namespace native { +namespace { + +bool check_tile_crop_out_args( + const Tensor& in, + int64_t tile_size, + Tensor& out) { + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(in, 3)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(out, 4)); + ET_LOG_AND_RETURN_IF_FALSE(tile_size > 0); + ET_LOG_AND_RETURN_IF_FALSE(in.size(in.dim() - 1) % tile_size == 0); + ET_LOG_AND_RETURN_IF_FALSE(in.size(in.dim() - 2) % tile_size == 0); + return true; +} + +void get_tile_crop_out_target_size( + const Tensor& in, + int64_t tile_size, + exec_aten::SizesType* out_sizes, + size_t* out_ndim) { + *out_ndim = in.dim() + 1; + + out_sizes[0] = in.size(1) * in.size(2) / (tile_size * tile_size); + out_sizes[1] = in.size(0); + out_sizes[2] = tile_size; + out_sizes[3] = tile_size; +} + +template +void tile_crop_impl(const Tensor& in, int64_t tile_size, Tensor& out) { + const CTYPE* const in_data = in.const_data_ptr(); + CTYPE* const out_data = out.mutable_data_ptr(); + + const auto channels = in.size(0); + const auto height = in.size(1); + const auto width = in.size(2); + + const auto HdivS = height / tile_size; + const auto WdivS = width / tile_size; + + size_t out_ix = 0; + for (size_t bH = 0; bH < HdivS; bH++) { + for (size_t bW = 0; bW < WdivS; bW++) { + for (size_t c = 0; c < channels; c++) { + for (size_t h = 0; h < tile_size; h++) { + for (size_t w = 0; w < tile_size; w++) { + size_t in_h = bH * tile_size + h; + size_t in_w = bW * tile_size + w; + size_t in_ix = c * height * width + in_h * width + in_w; + + out_data[out_ix++] = in_data[in_ix]; + } + } + } + } + } +} + +} // namespace Tensor& tile_crop_out_impl( RuntimeContext& ctx, const Tensor& input, // NOLINT const int64_t tile_size, // NOLINT Tensor& out) { - (void)ctx; + ET_KERNEL_CHECK( + ctx, + check_tile_crop_out_args(input, tile_size, out), + InvalidArgument, + out); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; + size_t expected_out_dim = 0; + get_tile_crop_out_target_size( + input, tile_size, expected_out_size, &expected_out_dim); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {expected_out_size, 
expected_out_dim}) == Error::Ok, + InvalidArgument, + out); + + constexpr auto name = "tile_crop.out"; + + ET_SWITCH_ALL_TYPES(out.scalar_type(), ctx, name, CTYPE, [&]() { + tile_crop_impl(input, tile_size, out); + }); + return out; } diff --git a/extension/llm/custom_ops/op_tile_crop_test.cpp b/extension/llm/custom_ops/op_tile_crop_test.cpp new file mode 100644 index 0000000000..565f510913 --- /dev/null +++ b/extension/llm/custom_ops/op_tile_crop_test.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +using namespace ::testing; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::testing::TensorFactory; + +class OpTileCropOutTest : public OperatorTest { + protected: + Tensor& op_tile_crop_out(const Tensor& self, int64_t tile_size, Tensor& out) { + return torch::executor::native::tile_crop_out_impl( + context_, self, tile_size, out); + } + + template + void test_tile_crop() { + TensorFactory tf_in; + + const std::vector sizes = {1, 4, 4}; + const std::vector out_sizes = {4, 1, 2, 2}; + + Tensor out = tf_in.zeros(out_sizes); + + // clang-format off + op_tile_crop_out( + tf_in.make( + sizes, { 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15}), + 2, + out); + EXPECT_TENSOR_EQ( + out, + tf_in.make( + out_sizes, {0, 1, 4, 5, + 2, 3, 6, 7, + 8, 9, 12, 13, + 10, 11, 14, 15})); + // clang-format on + } +}; + +// +// Correctness Tests +// + +/** + * Uses the function templates above to test all input dtypes. + */ +TEST_F(OpTileCropOutTest, AllRealDtypesSupported){ +#define ENUMERATE_TEST_ENTRY(ctype, dtype) test_tile_crop(); + ET_FORALL_REAL_TYPES(ENUMERATE_TEST_ENTRY) +#undef ENUMERATE_TEST_ENTRY +} + +// Mismatched shape tests. +TEST_F(OpTileCropOutTest, InvalidInputShapeDies) { + TensorFactory tf; + + // Input tensors with invalid shapes. 7 is not divisible by tile_size + Tensor in = tf.ones(/*sizes=*/{1, 7, 8}); + Tensor out = tf.zeros(/*sizes=*/{16, 1, 2, 2}); + + ET_EXPECT_KERNEL_FAILURE(context_, op_tile_crop_out(in, 2, out)); +} + +TEST_F(OpTileCropOutTest, WrongInputRankDies) { + TensorFactory tf; + + // Tile crop requires a 3D input tensor. + Tensor in = tf.ones(/*sizes=*/{1, 2}); + Tensor out = tf.zeros(/*sizes=*/{1, 2}); + + ET_EXPECT_KERNEL_FAILURE(context_, op_tile_crop_out(in, 2, out)); +} + +TEST_F(OpTileCropOutTest, DifferentDtypeDies) { + TensorFactory tf; + TensorFactory tf_float; + + Tensor in = tf.ones(/*sizes=*/{2, 12, 12}); + + // Tile crop requires two tensors with the same dtype. 
+ Tensor out = tf_float.zeros(/*sizes=*/{9, 2, 4, 4}); + + ET_EXPECT_KERNEL_FAILURE(context_, op_tile_crop_out(in, 3, out)); +} + +TEST_F(OpTileCropOutTest, NegativeTileSizeDies) { + TensorFactory tf; + Tensor in = tf.ones(/*sizes=*/{2, 12, 12}); + Tensor out = tf.zeros(/*sizes=*/{9, 2, 4, 4}); + ET_EXPECT_KERNEL_FAILURE(context_, op_tile_crop_out(in, -3, out)); +} diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index ee277515b1..fe93f6a422 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -119,3 +119,17 @@ def define_common_targets(): link_whole = True, force_static = True, ) + + runtime.cxx_test( + name = "op_tile_crop_test", + srcs = [ + "op_tile_crop_test.cpp", + ], + visibility = ["//executorch/..."], + deps = [ + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/core/exec_aten/testing_util:tensor_util", + "//executorch/kernels/test:test_util", + ":op_tile_crop", + ], + ) diff --git a/extension/module/metadata_util.h b/extension/llm/runner/metadata_util.h similarity index 100% rename from extension/module/metadata_util.h rename to extension/llm/runner/metadata_util.h diff --git a/extension/llm/runner/stats.h b/extension/llm/runner/stats.h index 31dd5e71cf..f62be0940c 100644 --- a/extension/llm/runner/stats.h +++ b/extension/llm/runner/stats.h @@ -8,12 +8,12 @@ // Runner stats for LLM #pragma once +#include +#include #include #include // patternlint-disable-next-line executorch-cpp-nostdinc #include - -#include namespace executorch::llm { struct Stats { @@ -40,6 +40,18 @@ struct Stats { int64_t num_prompt_tokens; // Token count from generated (total - prompt) int64_t num_generated_tokens; + inline void on_sampling_begin() { + aggregate_sampling_timer_start_timestamp = + ::torch::executor::util::time_in_ms(); + } + inline void on_sampling_end() { + aggregate_sampling_time_ms += ::torch::executor::util::time_in_ms() - + aggregate_sampling_timer_start_timestamp; + aggregate_sampling_timer_start_timestamp = 0; + } + + private: + long aggregate_sampling_timer_start_timestamp = 0; }; static constexpr auto kTopp = 0.9f; diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl index 81a3d32ba8..0f98926ec2 100644 --- a/extension/llm/runner/targets.bzl +++ b/extension/llm/runner/targets.bzl @@ -3,8 +3,69 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def define_common_targets(): runtime.cxx_library( name = "stats", - exported_headers = ["stats.h"], + exported_headers = [ + "stats.h", + "util.h", + ], visibility = [ "@EXECUTORCH_CLIENTS", ], ) + + for aten in (True, False): + aten_suffix = "_aten" if aten else "" + + runtime.cxx_library( + name = "text_decoder_runner" + aten_suffix, + exported_headers = ["text_decoder_runner.h"], + srcs = ["text_decoder_runner.cpp"], + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + ":stats", + "//executorch/extension/llm/sampler:sampler" + aten_suffix, + "//executorch/extension/module:module" + aten_suffix, + "//executorch/extension/runner_util:managed_tensor" + aten_suffix, + ], + ) + + runtime.cxx_library( + name = "text_prefiller" + aten_suffix, + exported_headers = ["text_prefiller.h"], + srcs = ["text_prefiller.cpp"], + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + ":text_decoder_runner" + aten_suffix, + "//executorch/extension/llm/tokenizer:tokenizer_header", + "//executorch/extension/module:module" + aten_suffix, + 
"//executorch/extension/runner_util:managed_tensor" + aten_suffix, + ], + ) + + runtime.cxx_library( + name = "text_token_generator" + aten_suffix, + exported_headers = ["text_token_generator.h"], + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + ":text_decoder_runner" + aten_suffix, + "//executorch/extension/llm/tokenizer:tokenizer_header", + "//executorch/extension/module:module" + aten_suffix, + "//executorch/extension/runner_util:managed_tensor" + aten_suffix, + ], + ) + + runtime.cxx_library( + name = "metadata_util" + aten_suffix, + exported_headers = ["metadata_util.h"], + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + "//executorch/extension/module:module" + aten_suffix, + ], + ) diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp new file mode 100644 index 0000000000..3de75ceccb --- /dev/null +++ b/extension/llm/runner/text_decoder_runner.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Given inputs, run a text decoder and return logits. + +#include +#include +#include + +namespace torch::executor { + +// NOTE: we observed ~2x loading performance increase on iPhone 15 +// and a ~5% improvement on Galaxy S22 by switching to +// FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors. +TextDecoderRunner::TextDecoderRunner( + Module* module, + bool use_kv_cache, + int32_t vocab_size, + float temperature) + : module_(module), + sampler_(std::make_unique( + vocab_size, + temperature, + ::executorch::llm::kTopp, + static_cast(std::time(nullptr)))), + use_kv_cache_(use_kv_cache) {} + +// This function is functional, meaning it shouldn't modify any state of the +// input. It should be safe to call multiple times with the same inputs. The +// outer loop (call site) is responsible for managing state. +Result TextDecoderRunner::step( + ManagedTensor& managed_tokens, + ManagedTensor& managed_start_pos) { + auto tokens = managed_tokens.get_aliasing_tensor(); + // ET_LOG(Info, "Input token %" PRIu64, input_token); + if (use_kv_cache_) { + auto start_pos = managed_start_pos.get_aliasing_tensor(); + Result> outputs_res = + module_->forward({tokens, start_pos}); + ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); + ET_CHECK_MSG( + outputs_res.get().size() == 1, + "More then one output returned from executing LLM."); + ET_CHECK_MSG( + outputs_res.get()[0].isTensor(), + "Non Tensor Output returned from executing LLM"); + + // Return the logits tensor + return outputs_res.get()[0].toTensor(); + } else { // no kv cache + (void)managed_start_pos; // unused + + Result> outputs_res = module_->forward({tokens}); + ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); + ET_CHECK_MSG( + outputs_res.get().size() == 1, + "More then one output returned from executing LLM."); + ET_CHECK_MSG( + outputs_res.get()[0].isTensor(), + "Non Tensor Output returned from executing LLM"); + + // Return the logits tensor + return outputs_res.get()[0].toTensor(); + } +} + +} // namespace torch::executor diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h new file mode 100644 index 0000000000..31b8c1b983 --- /dev/null +++ b/extension/llm/runner/text_decoder_runner.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Given inputs, run a text decoder in LLM and return the output. + +#pragma once + +#include +#include +#include +// patternlint-disable-next-line executorch-cpp-nostdinc +#include + +namespace torch::executor { + +class TextDecoderRunner { + public: + TextDecoderRunner( + Module* module, + bool use_kv_cache, + int32_t vocab_size, + float temperature); + /** + * Run LLM text decoder with inputs to generate next token. + * @param input The input to the LLM Module. + * @param start_pos The starting position in KV cache of the input in the LLM + * Module. + * @return The output of the LLM Module. This will be a tensor of logits. + */ + Result step( + ManagedTensor& input, + ManagedTensor& start_pos); + + /** + * Load the Module for a given method name. + * @param method_name The name of the method to load. + * @return The error code. + */ + inline Error load(const std::string& method_name = "forward") { + return module_->load_method(method_name); + } + + /** + * Check if the Module is loaded. + * @return True if the Module is loaded, false otherwise. + */ + inline bool is_method_loaded(const std::string& method_name = "forward") { + return module_->is_method_loaded(method_name); + } + + inline void stop() { + should_stop_ = true; + } + + /** + * Sample the next token from the logits tensor. + * @param logits_tensor The logits tensor. + * @return The next token. + */ + inline int32_t logits_to_token(const exec_aten::Tensor& logits_tensor) { + ET_CHECK_MSG(logits_tensor.dim() == 3, "Logits tensor must be 3D"); + auto num_tokens = logits_tensor.size(1); + auto vocab_size = logits_tensor.size(2); + + switch (logits_tensor.scalar_type()) { + case ScalarType::Float: { + float* logits = logits_tensor.mutable_data_ptr(); + float* logits_last = logits; + logits_last += (num_tokens - 1) * vocab_size; + return sampler_->sample(logits_last); + } + case ScalarType::Half: { + exec_aten::Half* logits = + logits_tensor.mutable_data_ptr(); + exec_aten::Half* logits_last = logits; + logits_last += (num_tokens - 1) * vocab_size; + return sampler_->sample(logits_last); + } + default: + ET_CHECK_MSG( + false, + "Unsupported dtype output %hhd", + static_cast(logits_tensor.scalar_type())); + } + } + + protected: + // TODO: use shared_ptr for module + Module* module_; + std::unique_ptr sampler_; + bool use_kv_cache_; + bool should_stop_{false}; +}; + +} // namespace torch::executor diff --git a/extension/llm/runner/text_prefiller.cpp b/extension/llm/runner/text_prefiller.cpp new file mode 100644 index 0000000000..a5aa668e73 --- /dev/null +++ b/extension/llm/runner/text_prefiller.cpp @@ -0,0 +1,117 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Given a text prompt, encode it using tokenizer and prefill the KV cache of a +// LLM. 
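The TextDecoderRunner above is the lowest-level piece of the new runner stack: the caller owns the token and position buffers, feeds them to step() one call at a time, and picks the next token with logits_to_token(). Below is a minimal greedy-decode sketch of driving it directly; the include paths, the "llama_module.pte" path, the 32000 vocab size and the BOS id are assumptions, and TextPrefiller/TextTokenGenerator later in this patch wrap essentially this same loop.

// Illustrative only: drive TextDecoderRunner by hand with a KV-cache model.
#include <executorch/extension/llm/runner/text_decoder_runner.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/runner_util/managed_tensor.h>

#include <cstdint>

using namespace ::torch::executor;

int main() {
  Module module("llama_module.pte"); // placeholder path
  TextDecoderRunner runner(
      &module,
      /*use_kv_cache=*/true,
      /*vocab_size=*/32000,   // assumption; a real runner reads this from model metadata
      /*temperature=*/0.0f);  // 0 selects greedy argmax in the Sampler
  if (runner.load() != Error::Ok) {
    return 1;
  }

  // With a KV cache, each step() takes one token plus the current position,
  // both wrapped in ManagedTensors that alias caller-owned storage.
  uint64_t cur_token = 1; // assumed BOS id
  int64_t pos = 0;
  ManagedTensor tokens(&cur_token, {1, 1}, ScalarType::Long);
  ManagedTensor start_pos(&pos, {1}, ScalarType::Long);

  for (int i = 0; i < 16; ++i) {
    Result<exec_aten::Tensor> logits = runner.step(tokens, start_pos);
    if (!logits.ok()) {
      return 1;
    }
    cur_token = runner.logits_to_token(logits.get());
    ++pos;
  }
  return 0;
}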
+ +#include + +namespace torch::executor { + +TextPrefiller::TextPrefiller( + Tokenizer* tokenizer, + TextDecoderRunner* text_decoder_runner, + bool use_kv_cache, + bool enable_parallel_prefill) + : tokenizer_(tokenizer), + text_decoder_runner_(text_decoder_runner), + use_kv_cache_(use_kv_cache), + enable_parallel_prefill_(enable_parallel_prefill) {} + +Result TextPrefiller::prefill( + std::vector& prompt_tokens, + int64_t start_pos, + std::function token_callback) { + ET_CHECK_MSG(!prompt_tokens.empty(), "Prompt cannot be null"); + if (!text_decoder_runner_->is_method_loaded()) { + ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load()); + } + // enable_parallel_prefill_ maybe set even when not using kv cache + // When kv cache is not used, start pos is ignored + int32_t num_prompt_tokens = prompt_tokens.size(); + + // store the token + uint64_t cur_token; + if (enable_parallel_prefill_ || !use_kv_cache_) { + // initialize tensor wrappers + ManagedTensor managed_tokens( + prompt_tokens.data(), {1, num_prompt_tokens}, ScalarType::Long); + + ManagedTensor managed_start_pos(&start_pos, {1}, ScalarType::Long); + + Result outputs_res = + text_decoder_runner_->step(managed_tokens, managed_start_pos); + + ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); + ET_LOG( + Info, "Prefill token result numel(): %zu", outputs_res.get().numel()); + ET_CHECK_MSG( + outputs_res.get().size(1) == num_prompt_tokens, + "Expected number of output tokens %d does not match returned value %zu.", + num_prompt_tokens, + outputs_res.get().size(1)); + // insert new token into prompt_tokens + // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds) + uint64_t prev = prompt_tokens[0]; + uint64_t cur; + for (int i = 0; i < prompt_tokens.size(); i++) { + cur = prompt_tokens[i]; + if (cur != tokenizer_->bos_tok()) { + token_callback(ET_UNWRAP(tokenizer_->decode(prev, cur))); + } + prev = cur; + } + cur_token = text_decoder_runner_->logits_to_token(outputs_res.get()); + } else { // sequential prefill + int64_t pos = 0; // position in the sequence + int64_t prev_token; + // token & pos + int64_t pos_data = 0; + // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds) + cur_token = prompt_tokens[0]; + + // initialize tensor wrappers + ManagedTensor managed_tokens(&cur_token, {1, 1}, ScalarType::Long); + + ManagedTensor managed_start_pos(&pos_data, {1}, ScalarType::Long); + + // run the first token and get back logits tensor. Assuming the first token + // is bos so don't callback. 
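    // Each step() in this sequential path consumes exactly one token:
    // managed_tokens aliases cur_token and managed_start_pos aliases
    // pos_data, so updating those scalars inside the loop below advances the
    // KV cache one position at a time (the parallel path above submits the
    // whole prompt in a single forward call instead).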
+ exec_aten::Tensor logits_tensor = ET_UNWRAP( + text_decoder_runner_->step(managed_tokens, managed_start_pos)); + + // if first token is not bos, we need to callback + if (cur_token != tokenizer_->bos_tok()) { + token_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token))); + } + pos = 1; // start from index 1 + + while (pos < num_prompt_tokens) { + // Run the model + pos_data = start_pos + pos; + + prev_token = cur_token; + + // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds) + cur_token = prompt_tokens[pos]; + + logits_tensor = ET_UNWRAP( + text_decoder_runner_->step(managed_tokens, managed_start_pos)); + + // print the token as string, decode it with the Tokenizer object + token_callback(ET_UNWRAP(tokenizer_->decode(prev_token, cur_token))); + + pos++; + } + + cur_token = text_decoder_runner_->logits_to_token(logits_tensor); + } + return cur_token; +} + +} // namespace torch::executor diff --git a/extension/llm/runner/text_prefiller.h b/extension/llm/runner/text_prefiller.h new file mode 100644 index 0000000000..7293fdca2a --- /dev/null +++ b/extension/llm/runner/text_prefiller.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Given a text prompt, encode it using tokenizer and prefill the KV cache of a +// LLM. + +#pragma once + +#include +#include +// patternlint-disable-next-line executorch-cpp-nostdinc +#include + +namespace torch::executor { + +class TextPrefiller { + public: + TextPrefiller( + Tokenizer* tokenizer, + TextDecoderRunner* text_decoder_runner, + bool use_kv_cache_, + bool enable_parallel_prefill); + /** + * Prefill an LLM Module with the given text input. + * @param prompt_tokens The text prompt tokens to the LLM Module. Encoded by + * tokenizer. + * @param start_pos The starting position in KV cache of the input in the LLM + * Module. + * @param token_callback A callback function that will be called for each + * token in the prompt. + * @return The next token of the LLM Module after prefill. + */ + Result prefill( + std::vector& prompt_tokens, + int64_t start_pos = 0, + std::function token_callback = {}); + + private: + Tokenizer* tokenizer_; + TextDecoderRunner* text_decoder_runner_; + bool use_kv_cache_; + bool enable_parallel_prefill_; +}; + +} // namespace torch::executor diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h new file mode 100644 index 0000000000..641793eb0e --- /dev/null +++ b/extension/llm/runner/text_token_generator.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Generate tokens in a loop. +#pragma once + +#include +#include +#include + +namespace torch::executor { +using Stats = ::executorch::llm::Stats; + +class TextTokenGenerator { + public: + TextTokenGenerator( + Tokenizer* tokenizer, + TextDecoderRunner* text_decoder_runner, + bool use_kv_cache, + uint64_t eos_id, + Stats* stats) + : tokenizer_(tokenizer), + text_decoder_runner_(text_decoder_runner), + eos_id_(eos_id), + use_kv_cache_(use_kv_cache), + stats_(stats) {} + + /** + * Token generation loop. + * @param tokens prompt tokens as well as the first token generated by + * prefill. 
+ * @param start_pos the start position of the new tokens, based on how many + * prompt tokens is prefilled. + * @param seq_len the total sequence length, including the prompt tokens, next + * token from prefill and new tokens. + * @param token_callback what to do after a token is generated. + * @return how many tokens are generated. + */ + inline Result generate( + std::vector tokens, + int64_t start_pos, + int32_t seq_len, + std::function token_callback) { + ET_CHECK_MSG( + !tokens.empty(), "Token generation loop shouldn't take empty tokens"); + int64_t pos = start_pos; // position in the sequence + + std::vector token_data; // allocate space for the tokens + std::vector token_shape; + + // Token after prefill + uint64_t cur_token = tokens.back(); + uint64_t prev_token; + + if (use_kv_cache_) { + // hard code these to size 1 as kv cache is locked to static size right + // now. + token_data = {cur_token}; + token_shape = {1, 1}; + } else { + token_data = tokens; + token_shape = {1, static_cast(tokens.size())}; + } + + // initialize tensor wrappers + ManagedTensor tokens_managed( + token_data.data(), token_shape, ScalarType::Long); + + ManagedTensor start_pos_managed(&pos, {1}, ScalarType::Long); + + // Generate our tokens + while (pos < seq_len - 1) { + // Run the model + Result logits_res = + text_decoder_runner_->step(tokens_managed, start_pos_managed); + + ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error()); + exec_aten::Tensor& logits_tensor = logits_res.get(); + + prev_token = cur_token; + + stats_->on_sampling_begin(); + cur_token = text_decoder_runner_->logits_to_token(logits_tensor); + stats_->on_sampling_end(); + + pos++; + + if (use_kv_cache_) { + // update the token tensor. token_data will not be empty. + // NOLINTNEXTLINE(facebook-hte-LocalUncheckedArrayBounds) + token_data[0] = cur_token; + } else { + // push it to the back + token_data.push_back(cur_token); + tokens_managed.resize({1, static_cast(token_data.size())}); + } + + // print the token as string, decode it with the Tokenizer object + token_callback(ET_UNWRAP(tokenizer_->decode(prev_token, cur_token))); + + if (should_stop_) { + break; + } + + // data-dependent terminating condition: we have n_eos_ number of EOS + if (cur_token == eos_id_) { + printf("\n"); + ET_LOG(Info, "\nReached to the end of generation"); + break; + } + } + return pos - start_pos; + } + + /** + * Stop the generation loop. 
+ */ + inline void stop() { + should_stop_ = true; + } + + private: + Tokenizer* tokenizer_; + TextDecoderRunner* text_decoder_runner_; + uint64_t eos_id_; + bool use_kv_cache_; + + // state machine + bool should_stop_ = false; + + // stats + Stats* stats_; +}; +} // namespace torch::executor diff --git a/examples/models/llama2/runner/util.h b/extension/llm/runner/util.h similarity index 100% rename from examples/models/llama2/runner/util.h rename to extension/llm/runner/util.h diff --git a/extension/llm/sampler/sampler.cpp b/extension/llm/sampler/sampler.cpp index be3307b715..6b0f155f12 100644 --- a/extension/llm/sampler/sampler.cpp +++ b/extension/llm/sampler/sampler.cpp @@ -33,6 +33,7 @@ */ #include +#include namespace torch { namespace executor { @@ -66,18 +67,6 @@ int32_t Sampler::sample_mult(T* probabilities, float coin) { return vocab_size_ - 1; // in case of rounding errors } -template -static int32_t compare(const void* a, const void* b) { - ProbIndex* a_ = (ProbIndex*)a; - ProbIndex* b_ = (ProbIndex*)b; - if (a_->prob > b_->prob) { - return -1; - } else if (a_->prob < b_->prob) { - return 1; - } - return 0; -} - template int32_t Sampler::sample_topp(T* probabilities, float coin) { // top-p sampling (or "nucleus sampling") samples from the smallest set of @@ -100,7 +89,11 @@ int32_t Sampler::sample_topp(T* probabilities, float coin) { n0++; } } - qsort(probindex.get(), n0, sizeof(ProbIndex), compare); + + auto compare = [](const ProbIndex& a, const ProbIndex& b) { + return a.prob > b.prob; + }; + std::sort(probindex.get(), probindex.get() + n0, compare); // truncate the list where cumulative probability exceeds topp T cumulative_prob = 0; @@ -131,7 +124,7 @@ Sampler::Sampler( float topp, unsigned long long rng_seed) : vocab_size_(vocab_size), - temperature_(temperature), + inv_temperature_(static_cast(temperature) ? 1.0f / temperature : 0), topp_(topp), rng_state_(rng_seed) {} @@ -172,13 +165,13 @@ template int32_t Sampler::sample(T* logits) { // sample the token given the logits and some hyperparameters int next; - if (temperature_ == 0.0f) { + if (inv_temperature_ == 0.0f) { // greedy argmax sampling: take the token with the highest probability next = sample_argmax(logits); } else { // apply the temperature to the logits for (int q = 0; q < vocab_size_; q++) { - logits[q] /= temperature_; + logits[q] *= inv_temperature_; } // apply softmax to the logits to get the probabilities for next token softmax(logits, vocab_size_); diff --git a/extension/llm/sampler/sampler.h b/extension/llm/sampler/sampler.h index b73a9859ed..584a010bba 100644 --- a/extension/llm/sampler/sampler.h +++ b/extension/llm/sampler/sampler.h @@ -51,7 +51,8 @@ class Sampler { private: int32_t vocab_size_; - float temperature_; + // reciprocal of temperature, or 0 if temperature == 0. 
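  // Storing the reciprocal lets sample() scale every logit with a multiply
  // instead of a divide, and a value of 0 doubles as the "greedy argmax"
  // flag checked at the top of sample().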
+ float inv_temperature_; float topp_; unsigned long long rng_state_; }; diff --git a/extension/llm/tokenizer/targets.bzl b/extension/llm/tokenizer/targets.bzl index 8229bced89..f8e4df095c 100644 --- a/extension/llm/tokenizer/targets.bzl +++ b/extension/llm/tokenizer/targets.bzl @@ -59,16 +59,29 @@ def define_common_targets(): ], ) + runtime.cxx_library( + name = "tokenizer_header", + exported_headers = [ + "tokenizer.h", + ], + exported_deps = [ + "//executorch/runtime/core:core", + ], + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + ) + runtime.cxx_library( name = "bpe_tokenizer", srcs = [ "bpe_tokenizer.cpp", ], exported_headers = [ - "tokenizer.h", "bpe_tokenizer.h", ], exported_deps = [ + ":tokenizer_header", "//executorch/runtime/core:core", ], visibility = [ @@ -82,11 +95,11 @@ def define_common_targets(): "tiktoken.cpp", ], exported_headers = [ - "tokenizer.h", "tiktoken.h", "base64.h", ], exported_deps = [ + ":tokenizer_header", "//executorch/runtime/core:core", ], visibility = [ diff --git a/extension/module/module.h b/extension/module/module.h index c8093ecdd2..4e81735c04 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -158,6 +158,41 @@ class Module final { return execute(method_name, {}); } + /** + * Retrieve the output value of a specific method with the given input. + * Loads the program and method before execution if needed. + * + * @param[in] method_name The name of the method to execute. + * @param[in] input A vector of input values to be passed to the method. + * + * @returns A Result object containing either the first output value from the + * method or an error to indicate failure. + */ + __ET_NODISCARD + Result get( + const std::string& method_name, + const std::vector& input) { + auto result = ET_UNWRAP(execute(method_name, input)); + if (result.empty()) { + return Error::InvalidArgument; + } + return result[0]; + } + + /** + * Retrieve the output value of a specific method without any input values. + * Loads the program and method before execution if needed. + * + * @param[in] method_name The name of the method to execute. + * + * @returns A Result object containing either the first output value from the + * method or an error to indicate failure. + */ + __ET_NODISCARD + Result get(const std::string& method_name) { + return get(method_name, {}); + } + /** * Execute the 'forward' method with the given input and retrieve output. * Loads the program and method before executing if needed. 
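Module::get() added above is a small convenience over execute(): it runs the method and returns only the first output EValue, or Error::InvalidArgument when the method produced no outputs. A sketch of calling it is below, mirroring the TestGet case added to module_test.cpp further down; the "model.pte" path and the float input are placeholders, and the portable (non-ATen) tensor types are assumed.

// Fetch the first output of "forward" via the new Module::get() helper.
#include <executorch/extension/module/module.h>

#include <array>

using namespace ::torch::executor;

int main() {
  Module module("model.pte"); // placeholder path

  std::array<float, 2> input{1, 2};
  std::array<exec_aten::SizesType, 2> sizes{1, 2};
  TensorImpl tensor(
      ScalarType::Float, sizes.size(), sizes.data(), input.data());

  Result<EValue> result = module.get("forward", {EValue(Tensor(&tensor))});
  if (!result.ok()) {
    return 1;
  }
  const float* data = result->toTensor().const_data_ptr<float>();
  return data == nullptr ? 1 : 0;
}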
diff --git a/extension/module/targets.bzl b/extension/module/targets.bzl index 07020b03a8..61251047dc 100644 --- a/extension/module/targets.bzl +++ b/extension/module/targets.bzl @@ -17,7 +17,6 @@ def define_common_targets(): ], exported_headers = [ "module.h", - "metadata_util.h", ], visibility = [ "@EXECUTORCH_CLIENTS", diff --git a/extension/module/test/module_test.cpp b/extension/module/test/module_test.cpp index 6c13c43cb9..d549117f33 100644 --- a/extension/module/test/module_test.cpp +++ b/extension/module/test/module_test.cpp @@ -202,6 +202,21 @@ TEST_F(ModuleTest, TestExecuteOnCurrupted) { EXPECT_FALSE(result.ok()); } +TEST_F(ModuleTest, TestGet) { + Module module(std::getenv("RESOURCES_PATH") + std::string("/model.pte")); + + std::array input{1, 2}; + std::array sizes{1, 2}; + TensorImpl tensor( + ScalarType::Float, sizes.size(), sizes.data(), input.data()); + + const auto result = module.get("forward", {EValue(Tensor(&tensor))}); + + EXPECT_TRUE(result.ok()); + const auto data = result->toTensor().const_data_ptr(); + EXPECT_NEAR(data[0], 1.5, 1e-5); +} + TEST_F(ModuleTest, TestForward) { auto module = std::make_unique( std::getenv("RESOURCES_PATH") + std::string("/model.pte")); diff --git a/extension/pybindings/TARGETS b/extension/pybindings/TARGETS index 3111ef6fa7..ecf23e4658 100644 --- a/extension/pybindings/TARGETS +++ b/extension/pybindings/TARGETS @@ -30,9 +30,10 @@ runtime.genrule( srcs = [":pybinding_types"], outs = { "aten_lib.pyi": ["aten_lib.pyi"], + "core.pyi": ["core.pyi"], "_portable_lib.pyi": ["_portable_lib.pyi"], }, - cmd = "cp $(location :pybinding_types)/* $OUT/_portable_lib.pyi && cp $(location :pybinding_types)/* $OUT/aten_lib.pyi", + cmd = "cp $(location :pybinding_types)/* $OUT/_portable_lib.pyi && cp $(location :pybinding_types)/* $OUT/aten_lib.pyi && cp $(location :pybinding_types)/* $OUT/core.pyi", visibility = ["//executorch/extension/pybindings/..."], ) @@ -40,6 +41,7 @@ executorch_pybindings( compiler_flags = ["-std=c++17"], cppdeps = PORTABLE_MODULE_DEPS, python_module_name = "core", + types = ["//executorch/extension/pybindings:pybindings_types_gen[core.pyi]"], visibility = ["PUBLIC"], ) diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index 1f6d1075a1..0dba760329 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -95,6 +95,34 @@ void write_data_to_file(const std::string& path, void* buf, size_t size) { } } +void setup_output_storage( + executor::Method& method, + const std::vector>& output_storages) { + if (output_storages.size() != method.outputs_size()) { + THROW_IF_ERROR( + Error(), + "number of output storages %zu does not match number of outputs %zu", + output_storages.size(), + method.outputs_size()); + } + for (size_t i = 0; i < output_storages.size(); ++i) { + if (output_storages[i].size() == 0) { + // Skip empty output storages, this would happen for non-tensor outputs. + continue; + } + Error output_status = method.set_output_data_ptr( + output_storages[i].data(), output_storages[i].size(), i); + // InvalidState can be the status if outputs are already memory planned. + // That's fine and we don't need to alert the user to that error. 
+ if (output_status != Error::Ok && output_status != Error::InvalidState) { + ET_LOG( + Error, + "Cannot set_output_data_ptr(): 0x%" PRIx32, + static_cast(output_status)); + } + } +} + using util::BufferDataLoader; using util::MallocMemoryAllocator; using util::MmapDataLoader; @@ -209,26 +237,7 @@ class Module final { c10::autograd_dispatch_keyset); #endif if (output_storages) { - if (output_storages->size() != method->outputs_size()) { - THROW_IF_ERROR( - Error(), - "number of output storages %zu does not match number of outputs %zu", - output_storages->size(), - method->outputs_size()); - } - for (size_t i = 0; i < output_storages->size(); ++i) { - Error output_status = method->set_output_data_ptr( - (*output_storages)[i].data(), (*output_storages)[i].size(), i); - // InvalidState can be the status if outputs are already memory planned. - // That's fine and we don't need to alert the user to that error. - if (output_status != Error::Ok && - output_status != Error::InvalidState) { - ET_LOG( - Error, - "Cannot set_output_data_ptr(): 0x%" PRIx32, - static_cast(output_status)); - } - } + setup_output_storage(*method, *output_storages); } Error execute_status = method->execute(); THROW_IF_ERROR( @@ -236,6 +245,11 @@ class Module final { "method->execute() failed with error 0x%" PRIx32, static_cast(execute_status)); // process outputs + return get_outputs(method_name); + } + + std::vector get_outputs(const std::string& method_name) { + auto& method = methods_[method_name]; std::vector result(method->outputs_size()); Error get_outputs_status = @@ -556,62 +570,17 @@ struct PyModule final { const auto& method = module_->get_method(method_name); const auto num_outputs = method.outputs_size(); - // These output storages will not be used if the ExecuTorch program already - // pre-allocated output space. That is represented by an error from - // set_output_data_ptr. - std::vector> output_storages(num_outputs); + output_storages_ = make_output_storages(method); std::vector> output_storage_spans(num_outputs); - for (size_t i = 0; i < num_outputs; ++i) { - const auto& output_tensor_meta = - method.method_meta().output_tensor_meta(i); - if (!output_tensor_meta.ok()) { - // If the output isn't a tensor it won't have a tensor meta. 
- ET_LOG( - Info, - "Tensor meta doesn't exist for output %zu, error is 0x%" PRIx32 - ", skipping allocating storage", - i, - static_cast(output_tensor_meta.error())); - output_storage_spans[i] = Span(); - continue; - } - const size_t output_size = output_tensor_meta.get().nbytes(); - std::unique_ptr output(new uint8_t[output_size]); - output_storage_spans[i] = Span(output.get(), output_size); - output_storages[i] = std::move(output); + for (int i = 0; i < output_storages_.size(); ++i) { + output_storage_spans[i] = + Span(output_storages_[i].data(), output_storages_[i].size()); } auto outputs = module_->run_method(method_name, cpp_inputs, output_storage_spans); // Retrieve outputs - const auto outputs_size = outputs.size(); - py::list list(outputs_size); - for (size_t i = 0; i < outputs_size; ++i) { - auto& v = outputs[i]; - if (Tag::None == v.tag) { - list[i] = py::none(); - } else if (Tag::Int == v.tag) { - list[i] = py::cast(v.toInt()); - } else if (Tag::Double == v.tag) { - list[i] = py::cast(v.toDouble()); - } else if (Tag::Bool == v.tag) { - list[i] = py::cast(v.toBool()); - } else if (Tag::String == v.tag) { - list[i] = py::cast(std::string(v.toString().data())); - } else if (Tag::Tensor == v.tag) { -#ifdef USE_ATEN_LIB - // Clone so the outputs in python do not share a lifetime with the - // module object - list[i] = py::cast(v.toTensor().clone()); -#else - list[i] = py::cast( - torch::util::alias_attensor_to_etensor(v.toTensor()).clone()); -#endif - } else { - ET_ASSERT_UNREACHABLE_MSG("Invalid model output type"); - } - } - return list; + return get_outputs_as_py_list(outputs); } py::list forward(const py::sequence& inputs) { @@ -670,35 +639,113 @@ struct PyModule final { static_cast(status)); } - void verify_result_with_bundled_expected_output( + py::list verify_result_with_bundled_expected_output( PyBundledModule& m, const string method_name, size_t testset_idx, double rtol = 1e-5, double atol = 1e-8) { const void* bundled_program_ptr = m.get_bundled_program_ptr(); - Error status = bundled_program::VerifyResultWithBundledExpectedOutput( - module_->get_method(method_name), - bundled_program_ptr, - testset_idx, - rtol, - atol); + auto& method = module_->get_method(method_name); + Error status = bundled_program::LoadBundledInput( + method, bundled_program_ptr, testset_idx); + THROW_IF_ERROR( + status, + "LoadBundledInput failed with status %" PRIu32, + static_cast(status)); + py::list outputs = plan_execute(method_name); + status = bundled_program::VerifyResultWithBundledExpectedOutput( + method, bundled_program_ptr, testset_idx, rtol, atol); THROW_IF_ERROR( status, "Result verification failed with status %" PRIu32, static_cast(status)); + return outputs; } - void plan_execute(const string method_name) { - auto status = module_->get_method(method_name).execute(); + py::list plan_execute(const string method_name) { + auto& method = module_->get_method(method_name); + // Need to pre-allocate space for outputs just like in run_method. 
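    // Outputs that the program has not memory-planned need caller-provided
    // buffers: make_output_storages() sizes one buffer per output from the
    // method metadata, setup_output_storage() registers them with the method,
    // and output_storages_ keeps them alive so bundled-program verification
    // can still read the results after execute() returns.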
+ const auto num_outputs = method.outputs_size(); + output_storages_ = make_output_storages(method); + std::vector> output_storage_spans(num_outputs); + for (int i = 0; i < output_storages_.size(); ++i) { + output_storage_spans[i] = + Span(output_storages_[i].data(), output_storages_[i].size()); + } + setup_output_storage(method, output_storage_spans); + auto status = method.execute(); THROW_IF_ERROR( status, "executing execution plan for method 'forward' failed with error: 0x%" PRIx32, static_cast(status)); + const auto outputs = module_->get_outputs(method_name); + return get_outputs_as_py_list(outputs); + } + + py::list get_outputs_as_py_list(const std::vector& outputs) { + const auto outputs_size = outputs.size(); + py::list list(outputs_size); + for (size_t i = 0; i < outputs_size; ++i) { + auto& v = outputs[i]; + if (Tag::None == v.tag) { + list[i] = py::none(); + } else if (Tag::Int == v.tag) { + list[i] = py::cast(v.toInt()); + } else if (Tag::Double == v.tag) { + list[i] = py::cast(v.toDouble()); + } else if (Tag::Bool == v.tag) { + list[i] = py::cast(v.toBool()); + } else if (Tag::String == v.tag) { + list[i] = py::cast(std::string(v.toString().data())); + } else if (Tag::Tensor == v.tag) { +#ifdef USE_ATEN_LIB + // Clone so the outputs in python do not share a lifetime with the + // module object + list[i] = py::cast(v.toTensor().clone()); +#else + list[i] = py::cast( + torch::util::alias_attensor_to_etensor(v.toTensor()).clone()); +#endif + } else { + ET_ASSERT_UNREACHABLE_MSG("Invalid model output type"); + } + } + return list; } private: std::unique_ptr module_; + // Need to keep-alive output storages until they can be compared in case of + // bundled programs. + std::vector> output_storages_; + + std::vector> make_output_storages( + const executor::Method& method) { + const auto num_outputs = method.outputs_size(); + // These output storages will not be used if the ExecuTorch program already + // pre-allocated output space. That is represented by an error from + // set_output_data_ptr. + std::vector> output_storages(num_outputs); + for (size_t i = 0; i < num_outputs; ++i) { + const auto& output_tensor_meta = + method.method_meta().output_tensor_meta(i); + if (!output_tensor_meta.ok()) { + // If the output isn't a tensor it won't have a tensor meta. + ET_LOG( + Error, + "Tensor meta doesn't exist for output %zu, error is 0x%" PRIx32 + ", skipping allocating storage", + i, + static_cast(output_tensor_meta.error())); + output_storages[i] = std::vector(); + continue; + } + const size_t output_size = output_tensor_meta.get().nbytes(); + output_storages[i] = std::vector(output_size); + } + return output_storages; + } }; void create_profile_block(const std::string& name) { diff --git a/extension/pybindings/pybindings.pyi b/extension/pybindings/pybindings.pyi index a085911672..e02ae0046f 100644 --- a/extension/pybindings/pybindings.pyi +++ b/extension/pybindings/pybindings.pyi @@ -14,10 +14,13 @@ class ExecuTorchModule: def run_method(self, method_name: str, inputs: Sequence[Any]) -> List[Any]: ... # pyre-ignore[2, 3]: "Any" in parameter and return type annotations. def forward(self, inputs: Sequence[Any]) -> List[Any]: ... + # pyre-ignore[3]: "Any" in return type annotations. + def plan_execute(self) -> List[Any]: ... # Bundled program methods. def load_bundled_input( self, bundle: BundledModule, method_name: str, testset_idx: int ) -> None: ... + # pyre-ignore[3]: "Any" in return type annotations. 
def verify_result_with_bundled_expected_output( self, bundle: BundledModule, @@ -25,7 +28,7 @@ class ExecuTorchModule: testset_idx: int, rtol: float = 1e-5, atol: float = 1e-8, - ) -> None: ... + ) -> List[Any]: ... def has_etdump(self) -> bool: ... def write_etdump_result_to_file( self, path: str, debug_buffer_path: Optional[str] = None diff --git a/extension/runner_util/managed_tensor.h b/extension/runner_util/managed_tensor.h index d92f8d19be..5e2fb62c6f 100644 --- a/extension/runner_util/managed_tensor.h +++ b/extension/runner_util/managed_tensor.h @@ -37,39 +37,29 @@ class ManagedTensor { using DimOrderType = exec_aten::DimOrderType; /// The type used for elements of `strides()`. using StridesType = exec_aten::StridesType; + ManagedTensor() = delete; explicit ManagedTensor( void* data, const std::vector& sizes, ScalarType dtype) - : dtype_(dtype), sizes_(sizes), data_ptr_(data) { + : sizes_(sizes) { #ifdef USE_ATEN_LIB - tensor_ = torch::from_blob(data, sizes, dtype_); + tensor_ = torch::from_blob(data, sizes, dtype); #else - ssize_t dim = sizes.size(); - dim_order_.resize(dim); - strides_.resize(dim); - for (size_t i = 0; i < dim; ++i) { - dim_order_[i] = i; - } - dim_order_to_stride_nocheck( - sizes.data(), dim_order_.data(), dim, strides_.data()); tensor_impl_ = std::make_unique( - dtype_, - dim, + dtype, + sizes_.size(), sizes_.data(), - data_ptr_, - dim_order_.data(), - strides_.data(), + data, + nullptr, + nullptr, TensorShapeDynamism::DYNAMIC_BOUND); #endif } void resize(const std::vector& new_sizes) { - ET_CHECK_MSG( - new_sizes.size() == sizes_.size(), - "Cannot change rank of a managed tensor"); auto err = resize_tensor( this->get_aliasing_tensor(), exec_aten::ArrayRef(new_sizes.data(), new_sizes.size())); @@ -88,15 +78,12 @@ class ManagedTensor { } private: - ScalarType dtype_; std::unique_ptr tensor_impl_; std::vector sizes_; - std::vector strides_; - std::vector dim_order_; - void* data_ptr_ = nullptr; #ifdef USE_ATEN_LIB Tensor tensor_; #endif }; + } // namespace executor } // namespace torch diff --git a/extension/runner_util/test/managed_tensor_test.cpp b/extension/runner_util/test/managed_tensor_test.cpp index 9c14553ed8..d5234570f4 100644 --- a/extension/runner_util/test/managed_tensor_test.cpp +++ b/extension/runner_util/test/managed_tensor_test.cpp @@ -42,15 +42,6 @@ TEST_F(ManagedTensorTest, Smoke) { EXPECT_EQ(tensor.sizes(), ArrayRef(sizes_.data(), sizes_.size())); EXPECT_EQ(tensor.scalar_type(), ScalarType::Long); - std::vector expected_dim_order = {0, 1}; - EXPECT_EQ( - tensor.dim_order(), - ArrayRef( - expected_dim_order.data(), expected_dim_order.size())); - std::vector expected_strides = {3, 1}; - EXPECT_EQ( - tensor.strides(), - ArrayRef(expected_strides.data(), expected_strides.size())); EXPECT_EQ(tensor.const_data_ptr(), data_.data()); } @@ -74,15 +65,6 @@ TEST_F(ManagedTensorTest, ResizeShrink) { tensor.sizes(), ArrayRef(expected_sizes.data(), expected_sizes.size())); EXPECT_EQ(tensor.scalar_type(), ScalarType::Long); - std::vector expected_dim_order = {0, 1}; - EXPECT_EQ( - tensor.dim_order(), - ArrayRef( - expected_dim_order.data(), expected_dim_order.size())); - std::vector expected_strides = {2, 1}; - EXPECT_EQ( - tensor.strides(), - ArrayRef(expected_strides.data(), expected_strides.size())); EXPECT_EQ(tensor.const_data_ptr(), data_.data()); } @@ -95,14 +77,5 @@ TEST_F(ManagedTensorTest, Resize) { tensor.sizes(), ArrayRef(expected_sizes.data(), expected_sizes.size())); EXPECT_EQ(tensor.scalar_type(), ScalarType::Long); - std::vector 
expected_dim_order = {0, 1}; - EXPECT_EQ( - tensor.dim_order(), - ArrayRef( - expected_dim_order.data(), expected_dim_order.size())); - std::vector expected_strides = {2, 1}; - EXPECT_EQ( - tensor.strides(), - ArrayRef(expected_strides.data(), expected_strides.size())); EXPECT_EQ(tensor.const_data_ptr(), data_.data()); } diff --git a/runtime/core/exec_aten/testing_util/tensor_factory.h b/runtime/core/exec_aten/testing_util/tensor_factory.h index 146d342692..55f4c56668 100644 --- a/runtime/core/exec_aten/testing_util/tensor_factory.h +++ b/runtime/core/exec_aten/testing_util/tensor_factory.h @@ -564,7 +564,8 @@ namespace internal { // values while using the defaults for everything else. template struct ScalarTypeToCppTypeWrapper { - using ctype = typename executorch::runtime::ScalarTypeToCppType::type; + using ctype = + typename ::executorch::runtime::ScalarTypeToCppType::type; }; // Use a C type of `uint8_t` instead of `bool`. The C type will be used to diff --git a/runtime/core/portable_type/tensor_impl.cpp b/runtime/core/portable_type/tensor_impl.cpp index 788a9116c6..fe6b57ea35 100644 --- a/runtime/core/portable_type/tensor_impl.cpp +++ b/runtime/core/portable_type/tensor_impl.cpp @@ -8,8 +8,8 @@ #include +#include #include -#include // std::memcpy #include #include @@ -25,11 +25,11 @@ namespace { * Compute the number of elements based on the sizes of a tensor. */ ssize_t compute_numel(const TensorImpl::SizesType* sizes, ssize_t dim) { - ssize_t n = 1; - for (ssize_t i = 0; i < dim; i++) { - n *= sizes[i]; + ssize_t numel = 1; // Zero-dimensional tensors (scalars) have numel == 1. + for (ssize_t i = 0; i < dim; ++i) { + numel *= sizes[i]; } - return n; + return numel; } } // namespace @@ -67,7 +67,7 @@ Error TensorImpl::internal_resize_contiguous(ArrayRef new_sizes) { ET_CHECK_OR_RETURN_ERROR( new_sizes.size() == dim_, NotSupported, - "ETensor rank is immutable old: %zu new: %zu", + "Attempted to change the tensor rank which is immutable: old=%zu, new=%zu", dim_, new_sizes.size()); @@ -82,55 +82,33 @@ Error TensorImpl::internal_resize_contiguous(ArrayRef new_sizes) { if (dim_ == 0) { return Error::Ok; } - - // Can only resize a StaticShape Tensor to the same size - if (shape_dynamism_ == TensorShapeDynamism::STATIC) { - for (int i = 0; i < new_sizes.size(); i++) { + switch (shape_dynamism_) { + case TensorShapeDynamism::STATIC: ET_CHECK_OR_RETURN_ERROR( - new_sizes[i] == sizes_[i], + std::equal(sizes_, sizes_ + dim_, new_sizes.begin()), NotSupported, - "Attempted to resize a static tensor to a new shape at " - "dimension %d old_size: %d new_size: %d", - i, - sizes_[i], - new_sizes[i]); - } - // no work to do after checking for error - return Error::Ok; - } - - const auto new_numel = compute_numel(new_sizes.data(), dim_); - - // Bounded tensors can be reshaped, but not beyond the upper bound. - if (shape_dynamism_ == TensorShapeDynamism::DYNAMIC_BOUND || + "Attempted to resize a static tensor"); + break; + case TensorShapeDynamism::DYNAMIC_BOUND: // TODO(T175194371): Unbounded dynamic tensor resizing is not yet // supported: treat them as upper-bounded. 
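      // Both dynamic modes share the bounded-resize handling below: the new
      // element count may not exceed numel_bound_, and strides are recomputed
      // only when the tensor was constructed with strides_ and dim_order_.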
- shape_dynamism_ == TensorShapeDynamism::DYNAMIC_UNBOUND) { - ET_CHECK_OR_RETURN_ERROR( - new_numel <= numel_bound_, - NotSupported, - "Attempted to resize a bounded tensor with capacity of %zu elements to %zu elements.", - new_numel, - numel_bound_); + case TensorShapeDynamism::DYNAMIC_UNBOUND: { + const auto new_numel = compute_numel(new_sizes.data(), dim_); + ET_CHECK_OR_RETURN_ERROR( + new_numel <= numel_bound_, + NotSupported, + "Attempted to resize a bounded tensor with capacity of %zu elements to %zu elements.", + new_numel, + numel_bound_); + + if (strides_ && dim_order_) { + ET_CHECK_OK_OR_RETURN_ERROR( + dim_order_to_stride(new_sizes.data(), dim_order_, dim_, strides_)); + } + numel_ = new_numel; + std::copy(new_sizes.begin(), new_sizes.end(), sizes_); + } } - - // Copy sizes over - std::memcpy(sizes_, new_sizes.data(), sizeof(SizesType) * dim_); - - // Compute new strides - ET_CHECK_OR_RETURN_ERROR( - strides_ != nullptr, Internal, "Strides cannot be nullptr for resize"); - ET_CHECK_OR_RETURN_ERROR( - dim_order_ != nullptr, - Internal, - "Dim order cannot be nullptr for resize"); - auto status = dim_order_to_stride(sizes_, dim_order_, dim_, strides_); - ET_CHECK_OR_RETURN_ERROR( - status == Error::Ok, - Internal, - "dim_order_to_stride returned invalid status"); - numel_ = new_numel; - return Error::Ok; } diff --git a/runtime/core/portable_type/test/tensor_impl_test.cpp b/runtime/core/portable_type/test/tensor_impl_test.cpp index e7e9d1fcf6..9e8e9d2a43 100644 --- a/runtime/core/portable_type/test/tensor_impl_test.cpp +++ b/runtime/core/portable_type/test/tensor_impl_test.cpp @@ -267,6 +267,93 @@ TEST_F(TensorImplTest, TestSetSizesContigUnbounded) { EXPECT_NE(err, Error::Ok); } +TEST_F(TensorImplTest, TestDynamicTensorNoStridesDimOrder) { + SizesType sizes[3] = {2, 3, 4}; + float data[24] = {0}; + TensorImpl t( + ScalarType::Float, + 3, + sizes, + data, + nullptr, + nullptr, + TensorShapeDynamism::DYNAMIC_BOUND); + + EXPECT_EQ(t.dim(), 3); + EXPECT_EQ(t.numel(), 24); + EXPECT_EQ(t.nbytes(), 24 * sizeof(float)); + + SizesType new_sizes[3] = {3, 2, 4}; + Error err = resize_tensor_impl(&t, {new_sizes, 3}); + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(t.dim(), 3); + EXPECT_EQ(t.size(0), 3); + EXPECT_EQ(t.size(1), 2); + EXPECT_EQ(t.size(2), 4); + EXPECT_EQ(t.numel(), 3 * 2 * 4); + + const float* y = t.data(); + EXPECT_EQ(y, data); +} + +TEST_F(TensorImplTest, TestDynamicTensorNoStridesDimOrderResizeDown) { + SizesType sizes[3] = {4, 4, 4}; + float data[64] = {0}; + TensorImpl t( + ScalarType::Float, + 3, + sizes, + data, + nullptr, + nullptr, + TensorShapeDynamism::DYNAMIC_BOUND); + + EXPECT_EQ(t.dim(), 3); + EXPECT_EQ(t.numel(), 64); + EXPECT_EQ(t.nbytes(), 64 * sizeof(float)); + + SizesType new_sizes[3] = {2, 2, 2}; + Error err = resize_tensor_impl(&t, {new_sizes, 3}); + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(t.dim(), 3); + EXPECT_EQ(t.size(0), 2); + EXPECT_EQ(t.size(1), 2); + EXPECT_EQ(t.size(2), 2); + EXPECT_EQ(t.numel(), 2 * 2 * 2); + + const float* y = t.data(); + EXPECT_EQ(y, data); +} + +TEST_F(TensorImplTest, TestDynamicTensorNoStridesDimOrderResizeZeroDim) { + SizesType sizes[3] = {4, 4, 4}; + float data[64] = {0}; + TensorImpl t( + ScalarType::Float, + 3, + sizes, + data, + nullptr, + nullptr, + TensorShapeDynamism::DYNAMIC_BOUND); + + EXPECT_EQ(t.dim(), 3); + EXPECT_EQ(t.numel(), 64); + EXPECT_EQ(t.nbytes(), 64 * sizeof(float)); + + SizesType new_sizes[3] = {0, 4, 4}; + Error err = resize_tensor_impl(&t, {new_sizes, 3}); + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(t.dim(), 3); + 
EXPECT_EQ(t.size(0), 0); + EXPECT_EQ(t.size(1), 4); + EXPECT_EQ(t.size(2), 4); + EXPECT_EQ(t.numel(), 0); + + const float* y = t.data(); + EXPECT_EQ(y, data); +} + TEST_F(TensorImplTest, TestWriteRead) { SizesType sizes[1] = {1}; DimOrderType dim_order[1] = {0}; diff --git a/runtime/executor/program.cpp b/runtime/executor/program.cpp index becdf5df5c..6a889625c6 100644 --- a/runtime/executor/program.cpp +++ b/runtime/executor/program.cpp @@ -410,5 +410,90 @@ Result Program::LoadSegment( segment_base_offset_ + segment->offset(), segment->size(), segment_info); } +Error Program::load_mutable_subsegment_into( + size_t mutable_data_segments_index, + size_t offset_index, + size_t size, + void* buffer) const { + EXECUTORCH_SCOPE_PROF("Program::load_subsegment_into"); + // Check that the program has segments. + if (loader_ == nullptr || segment_base_offset_ == 0) { + ET_LOG(Error, "No segments in program"); + return Error::NotFound; + } + + // Check that the program has mutable data segments. + if (internal_program_->mutable_data_segments() == nullptr) { + ET_LOG(Error, "No mutable data segments in program"); + return Error::NotFound; + } + if (mutable_data_segments_index >= + internal_program_->mutable_data_segments()->size()) { + ET_LOG( + Error, + "mutable_data_segments_index %zu out of range >= %" PRIu64, + mutable_data_segments_index, + (uint64_t)internal_program_->mutable_data_segments()->size()); + return Error::NotFound; + } + + // Grab the mutable data segment info. + const auto& segment_offsets = internal_program_->mutable_data_segments()->Get( + mutable_data_segments_index); + + // Check that the offset is valid. + if (segment_offsets->offsets() == nullptr) { + ET_LOG(Error, "No offsets in mutable data segment"); + return Error::NotFound; + } + if (offset_index >= segment_offsets->offsets()->size()) { + ET_LOG( + Error, + "offset index %zu out of range >= %" PRIu64, + offset_index, + (uint64_t)segment_offsets->offsets()->size()); + return Error::NotFound; + } + + // Grab the offset. Note: This offset is relative to the start of the segment, + // so we will need to adjust when calling the loader. + size_t offset = segment_offsets->offsets()->Get(offset_index); + + // Grab the segment index + size_t num_segments = internal_program_->segments()->size(); + if (segment_offsets->segment_index() >= num_segments) { + ET_LOG( + Error, + "Segment index %u out of range (>= %zu)", + segment_offsets->segment_index(), + num_segments); + return Error::NotFound; + } + + // Grab the segment + auto segment = + internal_program_->segments()->Get(segment_offsets->segment_index()); + + // Check size + if (offset + size > segment->size()) { + ET_LOG( + Error, + "offset %zu + size %zu out of range > %" PRIu64, + offset, + size, + segment->size()); + return Error::InvalidArgument; + } + + DataLoader::SegmentInfo info = DataLoader::SegmentInfo( + DataLoader::SegmentInfo::Type::Mutable, + segment_offsets->segment_index(), + nullptr); + + // Load the data + return loader_->load_into( + segment_base_offset_ + segment->offset() + offset, size, info, buffer); +} + } // namespace runtime } // namespace executorch diff --git a/runtime/executor/program.h b/runtime/executor/program.h index eb739943a0..1264863255 100644 --- a/runtime/executor/program.h +++ b/runtime/executor/program.h @@ -223,6 +223,30 @@ class Program final { __ET_NODISCARD Result LoadSegment( const DataLoader::SegmentInfo& segment_info) const; + /** + * Loads a portion of a mutable segment into the provided buffer. 
+ * + * @param[in] mutable_data_segments_index The index into the + * mutable_data_segments_array. + * @param[in] offset_index The index into the segment's offsets array. + * @param[in] size The number of bytes to load. + * @param[in] buffer The buffer to load data into. Must point to at least + * `size` bytes of memory. + * + * @returns An error code on if the load was successful. + * @retval Error::Ok The load was successful. + * @retval Error::NotFound The program does not contain any segments or the + * indices are out of range. + * @returns Other errors depending on the implementation of + * DataLoader: The Program.segment table is inconsistent, or the + * data cannot be accessed. + */ + __ET_NODISCARD Error load_mutable_subsegment_into( + size_t mutable_data_segments_index, + size_t offset_index, + size_t size, + void* buffer) const; + private: Program( DataLoader* loader, diff --git a/runtime/executor/test/program_test.cpp b/runtime/executor/test/program_test.cpp index 543627793d..7ff224355b 100644 --- a/runtime/executor/test/program_test.cpp +++ b/runtime/executor/test/program_test.cpp @@ -63,7 +63,7 @@ class ProgramTest : public ::testing::Test { add_loader_ = std::make_unique(std::move(loader.get())); - // Load the serialized ModuleAdd data. + // Load the serialized ModuleMultiEntry data. path = std::getenv("ET_MODULE_MULTI_ENTRY_PATH"); Result multi_loader = FileDataLoader::from(path); ASSERT_EQ(multi_loader.error(), Error::Ok); @@ -99,6 +99,16 @@ class ProgramTestFriend final { return program->LoadSegment(segment_info); } + __ET_NODISCARD static Error load_mutable_subsegment_into( + const Program* program, + size_t mutable_data_segments_index, + size_t offset_index, + size_t size, + void* buffer) { + return program->load_mutable_subsegment_into( + mutable_data_segments_index, offset_index, size, buffer); + } + const static executorch_flatbuffer::Program* GetInternalProgram( const Program* program) { return program->internal_program_; @@ -445,3 +455,89 @@ TEST_F(ProgramTest, LoadConstantSegmentWithNoConstantSegment) { // The constant buffer should exist. EXPECT_GE(flatbuffer_program->constant_buffer()->size(), 1); } + +TEST_F(ProgramTest, LoadFromMutableSegment) { + // Load the serialized ModuleSimpleTrain data. + auto path = std::getenv("ET_MODULE_SIMPLE_TRAIN_PATH"); + Result training_loader = FileDataLoader::from(path); + ASSERT_EQ(training_loader.error(), Error::Ok); + + // This file should always be compatible. + Result training_header = training_loader->load( + /*offset=*/0, + Program::kMinHeadBytes, + DataLoader::SegmentInfo(DataLoader::SegmentInfo::Type::Program)); + ASSERT_EQ(training_header.error(), Error::Ok); + EXPECT_EQ( + Program::check_header(training_header->data(), training_header->size()), + Program::HeaderStatus::CompatibleVersion); + + Result program = Program::load(&training_loader.get()); + ASSERT_EQ(program.error(), Error::Ok); + + // dummy buffers to load into + uint8_t buffer[1] = {0}; + uint8_t buffer2[1] = {0}; + + // Load some mutable segment data + Error err = ProgramTestFriend::load_mutable_subsegment_into( + &program.get(), 0, 1, 1, buffer); + EXPECT_EQ(err, Error::Ok); + + // Check that the data loaded correctly, and then mutate it + EXPECT_EQ(buffer[0], 232); // 232 comes from inspecting the file itself. The + // file is seeded so this value should be stable. + buffer[0] = 0; + + // Load the same mutable segment data from file into a different buffer. 
+ err = ProgramTestFriend::load_mutable_subsegment_into( + &program.get(), + 0, // mutable_data_segments_index + 1, // offset_index + 1, // size + buffer2); + EXPECT_EQ(err, Error::Ok); + + // Check that new data loaded from the file does not reflect the change to + // buffer. + EXPECT_EQ(buffer2[0], 232); + + const executorch_flatbuffer::Program* flatbuffer_program = + ProgramTestFriend::GetInternalProgram(&program.get()); + + // Expect 1 segment. 1 mutable segment and no constant segment. + EXPECT_EQ(flatbuffer_program->segments()->size(), 1); + + // Expect a mutable data segment. + EXPECT_EQ(flatbuffer_program->mutable_data_segments()->size(), 1); + + // Expect the 0 index to be reserved and the offsets for weight and bias of + // linear to be indices 1 and 2. + EXPECT_EQ( + flatbuffer_program->mutable_data_segments()->Get(0)->offsets()->size(), + 3); + EXPECT_EQ( + flatbuffer_program->mutable_data_segments()->Get(0)->offsets()->Get(0), + 0); + EXPECT_EQ( + flatbuffer_program->mutable_data_segments()->Get(0)->offsets()->Get(1), + 0); + EXPECT_EQ( + flatbuffer_program->mutable_data_segments()->Get(0)->offsets()->Get(2), + 36); + + // Loading beyond file should fail + err = ProgramTestFriend::load_mutable_subsegment_into( + &program.get(), 0, 1, 500, buffer); + EXPECT_NE(err, Error::Ok); + + // Loading beyond offsets should fail + err = ProgramTestFriend::load_mutable_subsegment_into( + &program.get(), 0, 500, 1, buffer); + EXPECT_NE(err, Error::Ok); + + // Loading beyond segments should fail + err = ProgramTestFriend::load_mutable_subsegment_into( + &program.get(), 500, 1, 1, buffer); + EXPECT_NE(err, Error::Ok); +} diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl index eaec540c27..401581421d 100644 --- a/runtime/executor/test/targets.bzl +++ b/runtime/executor/test/targets.bzl @@ -107,6 +107,7 @@ def define_common_targets(is_fbcode = False): "ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleLinear-no-constant-segment.pte])", "ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleLinear.pte])", "ET_MODULE_MULTI_ENTRY_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleMultipleEntry.pte])", + "ET_MODULE_SIMPLE_TRAIN_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleSimpleTrain.pte])", } runtime.cxx_test( diff --git a/runtime/kernel/kernel_runtime_context.h b/runtime/kernel/kernel_runtime_context.h index fba6d8e1c1..912e3e1113 100644 --- a/runtime/kernel/kernel_runtime_context.h +++ b/runtime/kernel/kernel_runtime_context.h @@ -121,10 +121,10 @@ using ::executorch::runtime::KernelRuntimeContext; // TODO(T147221312): Remove these aliases once all code uses // KernelRuntimeContext. 
 namespace exec_aten {
-using RuntimeContext = executorch::runtime::KernelRuntimeContext;
+using RuntimeContext = ::executorch::runtime::KernelRuntimeContext;
 } // namespace exec_aten
 namespace torch {
 namespace executor {
-using RuntimeContext = executorch::runtime::KernelRuntimeContext;
+using RuntimeContext = ::executorch::runtime::KernelRuntimeContext;
 } // namespace executor
 } // namespace torch
diff --git a/sdk/CMakeLists.txt b/sdk/CMakeLists.txt
index 8f677000c8..79903fc315 100644
--- a/sdk/CMakeLists.txt
+++ b/sdk/CMakeLists.txt
@@ -62,6 +62,10 @@ set(FLATCC_REFLECTION
     OFF
     CACHE BOOL ""
 )
+set(FLATCC_DEBUG_CLANG_SANITIZE
+    OFF
+    CACHE BOOL ""
+)
 set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc)
 add_subdirectory(${_flatcc_source_dir} ${CMAKE_BINARY_DIR}/third-party/flatcc)
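Taken together, the new runner pieces in this patch (TextDecoderRunner, TextPrefiller, TextTokenGenerator and the Stats sampling hooks) are meant to be composed by a model-specific runner. The sketch below shows one way they fit, under stated assumptions: the include paths, the 32000 vocab size, the 128-token budget, the prompt and the Tokenizer::encode() bos/eos flags are placeholders, and a production runner such as the llama2 one also reads model metadata and fills in the remaining Stats fields.

// Illustrative composition of the new LLM runner components.
#include <executorch/extension/llm/runner/stats.h>
#include <executorch/extension/llm/runner/text_decoder_runner.h>
#include <executorch/extension/llm/runner/text_prefiller.h>
#include <executorch/extension/llm/runner/text_token_generator.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>

#include <iostream>
#include <string>
#include <vector>

namespace torch::executor {

// Assumes `module` wraps a KV-cache llama-style model and `tokenizer` is
// already loaded.
Error generate_once(
    Module& module,
    Tokenizer& tokenizer,
    const std::string& prompt) {
  TextDecoderRunner decoder(
      &module, /*use_kv_cache=*/true, /*vocab_size=*/32000, /*temperature=*/0.8f);
  TextPrefiller prefiller(
      &tokenizer, &decoder, /*use_kv_cache=*/true, /*enable_parallel_prefill=*/true);
  ::executorch::llm::Stats stats;
  TextTokenGenerator generator(
      &tokenizer, &decoder, /*use_kv_cache=*/true, tokenizer.eos_tok(), &stats);

  auto echo = [](const std::string& piece) { std::cout << piece << std::flush; };

  // Encode the prompt, prefill the KV cache, then generate until EOS or the
  // sequence-length budget is exhausted.
  std::vector<uint64_t> tokens =
      ET_UNWRAP(tokenizer.encode(prompt, /*bos=*/1, /*eos=*/0));
  int64_t num_prompt_tokens = tokens.size();
  uint64_t first_token =
      ET_UNWRAP(prefiller.prefill(tokens, /*start_pos=*/0, echo));
  tokens.push_back(first_token);
  int64_t generated =
      ET_UNWRAP(generator.generate(tokens, num_prompt_tokens, /*seq_len=*/128, echo));
  (void)generated;
  return Error::Ok;
}

} // namespace torch::executor

Prefill returns the first generated token so the generator can seed its loop from tokens.back(); that is why the token is appended before generate() is called, and why generate() starts counting positions at the number of prefilled prompt tokens.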