2024-07-19 nightly release (c757499)
pytorchbot committed Jul 19, 2024
1 parent 65e810d commit 6f84be0
Showing 14 changed files with 323 additions and 119 deletions.
69 changes: 39 additions & 30 deletions .github/workflows/android.yml
@@ -9,8 +9,7 @@ on:
paths:
- .ci/docker/**
- .github/workflows/android.yml
- build/build_android_library.sh
- build/test_android_ci.sh
- build/*android*.sh
- install_requirements.sh
- examples/demo-apps/android/**
- extension/android/**
@@ -22,15 +21,14 @@ concurrency:
cancel-in-progress: true

jobs:
build-demo-android:
name: build-demo-android
build-llm-demo:
name: build-llm-demo
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
strategy:
matrix:
tiktoken: [OFF, ON]
tokenizer: [bpe, tiktoken]
with:
# NB: The example model dl3 requires lots of memory (T161064121)
runner: linux.12xlarge
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-clang12-android
submodules: 'true'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
@@ -42,28 +40,39 @@ jobs:
# The generic Linux job chooses to use the base env, not the one set up by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
# Setup Linux dependencies
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2
# Build Android library
export EXECUTORCH_USE_TIKTOKEN=${{ matrix.tiktoken }}
bash build/build_android_library.sh
# Build Android demo app
bash build/test_android_ci.sh
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
# Build LLM Demo for Android
bash build/build_android_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME}
# Upload artifacts to S3. The artifacts are needed not only by the device farm but also by TorchChat
upload-artifacts:
needs: build-llm-demo
runs-on: linux.2xlarge
steps:
- name: Download the artifacts from GitHub
uses: actions/download-artifact@v3
with:
# The name here needs to match the name of the upload-artifact parameter
name: android-apps
path: ${{ runner.temp }}/artifacts/

- name: Verify the artifacts
shell: bash
working-directory: ${{ runner.temp }}/artifacts/
run: |
ls -lah ./
mkdir -p artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN
mkdir -p artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/arm64-v8a/
mkdir -p artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/x86_64/
# Copy the jar to S3
cp extension/android/build/libs/executorch.jar artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/
# Copy the app and its test suite to S3
cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/debug/*.apk artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/
cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/androidTest/debug/*.apk artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/
# Also copy the libraries
cp cmake-out-android-arm64-v8a/lib/*.a artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/arm64-v8a/
cp cmake-out-android-arm64-v8a/extension/android/*.so artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/arm64-v8a/
cp cmake-out-android-x86_64/lib/*.a artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/x86_64/
cp cmake-out-android-x86_64/extension/android/*.so artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/x86_64/
# Copy the AAR to S3
cp executorch.aar artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/
cp executorch-llama.aar artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/
- name: Upload the artifacts to S3
uses: seemethere/upload-artifact-s3@v5
with:
s3-bucket: gha-artifacts
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/artifact
# NOTE: Consuming stale artifacts doesn't make sense for benchmarking, as the goal is always to
# benchmark models that are as fresh as possible. I'm okay with keeping the 14 retention-days for
# now for TorchChat until we have a periodic job that can publish it more often. Ideally I want to
# reduce it to <= 2 days, meaning the benchmark job would run daily.
retention-days: 14
if-no-files-found: ignore
path: ${{ runner.temp }}/artifacts/
12 changes: 9 additions & 3 deletions backends/vulkan/runtime/gen_vulkan_spv.py
@@ -231,6 +231,7 @@ def layout_declare_tensor(
var_name: str,
dtype: str,
storage_type: str,
is_scalar_array: bool = False,
precision: str = "PRECISION",
) -> str:
assert storage_type.lower() in ["buffer", "texture3d", "texture2d"]
@@ -242,7 +243,12 @@ def layout_declare_tensor(
# Create buffer binding
if storage_type.lower() == "buffer":
return layout_declare_buffer(
slot, access_type, var_name, dtype, precision, is_scalar_array=False
slot,
access_type,
var_name,
dtype,
precision,
is_scalar_array=is_scalar_array,
)

# Create image/sampler binding
@@ -533,7 +539,7 @@ def generateVariantCombinations(
curr_suffix = (
suffix + "_" + str(i) if suffix else str(i)
)
param_values.append((param_name, curr_suffix, str(i)))
param_values.append((param_name, curr_suffix, i))
else:
raise ValueError(
f"{value['RANGE']} is not a valid range. Must be in format [start, end] (inclusive)."
@@ -595,7 +601,7 @@ def parseTemplateYaml(self, yaml_file: str) -> None:
variant_name = variant["NAME"]
for param_value in combination:
default_params_copy[param_value[0]] = param_value[2]
if len(param_value[1]) > 0:
if len(str(param_value[1])) > 0:
variant_name = f"{variant_name}_{param_value[1]}"

default_params_copy["NAME"] = variant_name
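The net effect of this change is that RANGE parameter values now flow through as integers, while the variant-name suffix stays a string (hence the len(str(...)) guard). Below is a minimal Python sketch of that expansion; expand_range is a hypothetical stand-in for the generator's RANGE handling, not an actual function in gen_vulkan_spv.py.

# Hypothetical helper illustrating the RANGE expansion after this change:
# the third tuple element stays an int, the suffix stays a string.
def expand_range(param_name: str, start: int, end: int, suffix: str = "") -> list:
    param_values = []
    for i in range(start, end + 1):  # "[start, end] (inclusive)", per the error message
        curr_suffix = suffix + "_" + str(i) if suffix else str(i)
        param_values.append((param_name, curr_suffix, i))  # previously str(i)
    return param_values

# Example: RANGE: [1, 3] for a parameter NREG yields
# [("NREG", "1", 1), ("NREG", "2", 2), ("NREG", "3", 3)], and variant names
# get "_1", "_2", "_3" appended because len(str(curr_suffix)) > 0.
print(expand_range("NREG", 1, 3))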
64 changes: 64 additions & 0 deletions backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl
@@ -0,0 +1,64 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

layout(std430) buffer;

$if MEMTYPE == "ubo":
${layout_declare_ubo(0, "vec4", "A")}
$elif MEMTYPE == "buffer":
${layout_declare_buffer(0, "r", "A", DTYPE, "PRECISION", False)}
$else:
${layout_declare_buffer(0, "r", "_", DTYPE, "PRECISION", False)}

${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int niter = 1;
layout(constant_id = 4) const int nvec = 1;
layout(constant_id = 5) const int local_group_size = 1;

$if MEMTYPE == "shared":
shared vec4 A[nvec];

void main() {

$if MEMTYPE == "shared":
A[gl_LocalInvocationID[0]][0] = gl_LocalInvocationID[0];
memoryBarrierShared();

// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
// This limits accesses to a specific set of unique addresses, depending on
// the access size we want to measure.
const int addr_mask = nvec - 1;
vec4 sum = vec4(0);

// This distributes the accesses to unique addresses across the workgroups once
// the access size exceeds the workgroup width.
const uint workgroup_width = local_group_size * niter * ${NUNROLL};
uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask;

int i = 0;
for (; i < niter; ++i){
$for j in range(int(NUNROLL)):
sum *= A[offset];

// On each unroll, a new unique address will be accessed through the offset,
// limited by the address mask to a specific set of unique addresses.
offset = (offset + local_group_size) & addr_mask;
}

// This is to ensure no compiler optimizations occur
vec4 zero = vec4(i>>31);

B[gl_LocalInvocationID[0]] = sum + zero;
}
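The offset arithmetic above relies on the power-of-two identity mentioned in the comment: masking with nvec - 1 is equivalent to taking the value modulo nvec. Here is a small Python check of that access pattern, with made-up values for nvec and local_group_size (the real ones come from the shader's specialization constants):

# nvec must be a power of two for addr_mask = nvec - 1 to act as a modulo.
nvec = 8
addr_mask = nvec - 1
local_group_size = 3  # made-up stride standing in for the specialization constant

offset = 0
seen = []
for i in range(16):  # a few "unrolled" iterations
    seen.append(offset)
    offset = (offset + local_group_size) & addr_mask

# The masked walk is the same as a modular walk over nvec unique addresses.
assert seen == [(i * local_group_size) % nvec for i in range(16)]
print(seen)  # only addresses 0..nvec-1 ever appear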
18 changes: 18 additions & 0 deletions backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.yaml
@@ -0,0 +1,18 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

buf_bandwidth:
parameter_names_with_default_values:
DTYPE: float
STORAGE: buffer
NUNROLL: "16"
generate_variant_forall:
MEMTYPE:
- VALUE: ubo
- VALUE: buffer
- VALUE: shared
shader_variants:
- NAME: buf_bandwidth
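The three MEMTYPE values produce three shader variants, which app.cpp (below) looks up by lowercasing the memory-type label and prefixing it with the template name. A tiny Python sketch of that mapping, assuming the variant suffix is simply the lowercased MEMTYPE value, as the C++ lookup implies:

# Which declaration each MEMTYPE variant selects in buf_bandwidth.glsl, and the
# shader name app.cpp requests ("buf_bandwidth_" + lowercased memtype).
variants = {
    "ubo": 'layout_declare_ubo(0, "vec4", "A")',
    "buffer": 'layout_declare_buffer(0, "r", "A", DTYPE, ...)',
    "shared": "shared vec4 A[nvec] (the bound source buffer is unused)",
}
for memtype, decl in variants.items():
    print(f"buf_bandwidth_{memtype}: {decl}")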
121 changes: 112 additions & 9 deletions backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -7,7 +7,6 @@
*/

#include <executorch/backends/vulkan/runtime/api/api.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
#include <iostream>

#include "stats.h"
@@ -18,6 +17,7 @@ using namespace vkapi;
class App {
private:
size_t buf_cache_size_;
uint32_t max_shared_mem_size_;
uint32_t sm_count_;
uint32_t nthread_logic_;

@@ -33,11 +33,12 @@ class App {
sm_count_ = cl_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();

max_shared_mem_size_ = cl_device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
std::cout << std::endl;
std::cout << "SM count," << sm_count_ << std::endl;
std::cout << "Logic Thread Count," << nthread_logic_ << std::endl;
std::cout << "Cache Size," << buf_cache_size_ << std::endl;
std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl;
}

void reg_count() {
@@ -58,9 +59,7 @@ class App {
uint32_t NITER;

auto bench = [&](uint32_t ngrp, uint32_t nreg) {
size_t len = sizeof(float);
StorageBuffer buffer(context(), vkapi::kFloat, len);
ParamsBuffer params(context(), int32_t(len));
StorageBuffer buffer(context(), vkapi::kFloat, 1);
vkapi::PipelineBarrier pipeline_barrier{};

auto shader_name = "reg_count_" + std::to_string(nreg);
@@ -74,8 +73,7 @@ class App {
{SV(NITER)},
VK_NULL_HANDLE,
0,
buffer.buffer(),
params.buffer());
buffer.buffer());
});
return time;
};
@@ -167,9 +165,8 @@ class App {
uint32_t NITER;

auto bench = [&](int stride) {
size_t len = sizeof(float);
StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE);
StorageBuffer out_buf(context(), vkapi::kFloat, len);
StorageBuffer out_buf(context(), vkapi::kFloat, 1);
vkapi::PipelineBarrier pipeline_barrier{};

auto shader_name = "buf_cacheline_size";
@@ -213,6 +210,109 @@ class App {

std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl;
}

private:
void _bandwidth(std::string memtype, uint32_t range) {
// TODO: Make these values configurable
// Cache lines flushed
const uint32_t NFLUSH = 4;
// Number of loop unrolls. Changing this value requires an equal change in
// buf_bandwidth.yaml
const uint32_t NUNROLL = 16;
// Number of iterations. Increasing this value reduces noise in exchange for
// higher latency.
const uint32_t NITER = 10;
// Vector dimensions (vec4)
const uint32_t VEC_WIDTH = 4;
const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
// Number of vectors that fit in the selected memory space
const uint32_t NVEC = range / VEC_SIZE;
// Number of memory reads per thread
const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
// Number of threads needed to read all vectors
// The thread count is not divided by the per-thread workload in shared memory
// because of the limited memory size.
const uint32_t NTHREAD =
memtype == "Shared" ? NVEC : NVEC / NREAD_PER_THREAD;
// Occupy all threads
const uint32_t local_x = nthread_logic_;
// Ensure that global is a multiple of local, and distribute across all SMs
const uint32_t global_x =
(NTHREAD / local_x * local_x) * sm_count_ * NFLUSH;

auto bench = [&](uint32_t access_size) {
// Number of vectors that fit in this iteration
const uint32_t nvec_access = access_size / VEC_SIZE;

StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float));
StorageBuffer out_buf(
context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_);
vkapi::PipelineBarrier pipeline_barrier{};

auto memtype_lower = memtype;
std::transform(
memtype_lower.begin(),
memtype_lower.end(),
memtype_lower.begin(),
[](unsigned char c) { return std::tolower(c); });
auto shader_name = "buf_bandwidth_" + memtype_lower;

auto time = benchmark_on_gpu(shader_name, 10, [&]() {
context()->submit_compute_job(
VK_KERNEL_FROM_STR(shader_name),
pipeline_barrier,
{global_x, 1, 1},
{local_x, 1, 1},
{SV(NITER), SV(nvec_access), SV(local_x)},
VK_NULL_HANDLE,
0,
in_buf.buffer(),
out_buf.buffer());
});

const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
auto gbps = SIZE_TRANS * 1e-3 / time;
std::cout << memtype << " bandwidth accessing \t" << access_size
<< "\tB unique data is \t" << gbps << " \tgbps (\t" << time
<< "\tus)" << std::endl;
return gbps;
};

double max_bandwidth = 0;
double min_bandwidth = DBL_MAX;
for (uint32_t access_size = VEC_SIZE; access_size < range;
access_size *= 2) {
double gbps = bench(access_size);
max_bandwidth = std::max(gbps, max_bandwidth);
min_bandwidth = std::min(gbps, min_bandwidth);
}

std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth
<< std::endl;
std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth
<< std::endl;
}

public:
void buf_bandwidth() {
std::cout << "\n------ Memory Bandwidth ------" << std::endl;
// Maximum memory space read - 128MB
// For regular devices, bandwidth plateaus at less memory than this, so more
// is not needed.
const uint32_t RANGE = 128 * 1024 * 1024;
_bandwidth("Buffer", RANGE);
}

void ubo_bandwidth() {
std::cout << "\n------ UBO Bandwidth ------" << std::endl;
const uint32_t RANGE = 128 * 1024 * 1024;
_bandwidth("UBO", RANGE);
}
void shared_mem_bandwidth() {
std::cout << "\n------ Shared Bandwidth ------" << std::endl;
const uint32_t RANGE = max_shared_mem_size_;
_bandwidth("Shared", RANGE);
}
};

int main(int argc, const char** argv) {
@@ -221,6 +321,9 @@ int main(int argc, const char** argv) {
// TODO: Allow user to skip tests
app.reg_count();
app.buf_cacheline_size();
app.buf_bandwidth();
app.ubo_bandwidth();
app.shared_mem_bandwidth();

return 0;
}
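For a rough sense of the numbers _bandwidth() works with, here is a back-of-the-envelope Python version of its sizing math for the buffer case, with hypothetical values for nthread_logic_, sm_count_, and the kernel time (the real ones are queried from OpenCL and measured at runtime):

# Constants copied from _bandwidth(); device values below are made up.
NFLUSH, NUNROLL, NITER = 4, 16, 10
VEC_WIDTH = 4
VEC_SIZE = VEC_WIDTH * 4            # vec4 of 32-bit floats = 16 bytes
RANGE = 128 * 1024 * 1024           # 128 MB ceiling used for the buffer/UBO runs

nthread_logic, sm_count = 1024, 16  # hypothetical device limits
NVEC = RANGE // VEC_SIZE            # vectors that fit in the range
NREAD_PER_THREAD = NUNROLL * NITER  # reads issued by each thread
NTHREAD = NVEC // NREAD_PER_THREAD  # buffer/UBO case ("Shared" uses NVEC directly)
local_x = nthread_logic
global_x = (NTHREAD // local_x * local_x) * sm_count * NFLUSH

SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE  # bytes moved per dispatch
time_us = 50000.0                                    # made-up kernel time in microseconds
gbps = SIZE_TRANS * 1e-3 / time_us                   # kB / us is numerically GB/s
print(NVEC, NTHREAD, global_x, SIZE_TRANS, round(gbps, 1))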
