2024-07-19 nightly release (c757499)
pytorchbot committed Jul 19, 2024
1 parent 65e810d commit 6f84be0
Showing 14 changed files with 323 additions and 119 deletions.
69 changes: 39 additions & 30 deletions .github/workflows/android.yml
@@ -9,8 +9,7 @@ on:
paths:
- .ci/docker/**
- .github/workflows/android.yml
- build/build_android_library.sh
- build/test_android_ci.sh
- build/*android*.sh
- install_requirements.sh
- examples/demo-apps/android/**
- extension/android/**
@@ -22,15 +21,14 @@ concurrency:
cancel-in-progress: true

jobs:
build-demo-android:
name: build-demo-android
build-llm-demo:
name: build-llm-demo
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
strategy:
matrix:
tiktoken: [OFF, ON]
tokenizer: [bpe, tiktoken]
with:
# NB: The example model dl3 requires lots of memory (T161064121)
runner: linux.12xlarge
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-clang12-android
submodules: 'true'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
@@ -42,28 +40,39 @@ jobs:
# The generic Linux job chooses to use the base env, not the one set up by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
# Setup Linux dependencies
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2
# Build Android library
export EXECUTORCH_USE_TIKTOKEN=${{ matrix.tiktoken }}
bash build/build_android_library.sh
# Build Android demo app
bash build/test_android_ci.sh
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
# Build LLM Demo for Android
bash build/build_android_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME}
# Upload artifacts to S3. The artifacts are needed not only by the device farm but also by TorchChat
upload-artifacts:
needs: build-llm-demo
runs-on: linux.2xlarge
steps:
- name: Download the artifacts from GitHub
uses: actions/download-artifact@v3
with:
# The name here needs to match the name of the upload-artifact parameter
name: android-apps
path: ${{ runner.temp }}/artifacts/

- name: Verify the artifacts
shell: bash
working-directory: ${{ runner.temp }}/artifacts/
run: |
ls -lah ./
mkdir -p artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN
mkdir -p artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/arm64-v8a/
mkdir -p artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/x86_64/
# Copy the jar to S3
cp extension/android/build/libs/executorch.jar artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/
# Copy the app and its test suite to S3
cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/debug/*.apk artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/
cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/androidTest/debug/*.apk artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/
# Also copy the libraries
cp cmake-out-android-arm64-v8a/lib/*.a artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/arm64-v8a/
cp cmake-out-android-arm64-v8a/extension/android/*.so artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/arm64-v8a/
cp cmake-out-android-x86_64/lib/*.a artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/x86_64/
cp cmake-out-android-x86_64/extension/android/*.so artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/x86_64/
# Copy the AAR to S3
cp executorch.aar artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/
cp executorch-llama.aar artifacts-to-be-uploaded/tiktoken_$EXECUTORCH_USE_TIKTOKEN/
- name: Upload the artifacts to S3
uses: seemethere/upload-artifact-s3@v5
with:
s3-bucket: gha-artifacts
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/artifact
# NOTE: Consuming stale artifacts doesn't make sense for benchmarking, as the goal is always to
# benchmark models that are as fresh as possible. I'm okay with keeping the 14 retention-days for
# now for TorchChat until we have a periodic job that can publish it more often. Ideally I want to
# reduce it to <= 2 days, meaning the benchmark job would run daily.
retention-days: 14
if-no-files-found: ignore
path: ${{ runner.temp }}/artifacts/
12 changes: 9 additions & 3 deletions backends/vulkan/runtime/gen_vulkan_spv.py
@@ -231,6 +231,7 @@ def layout_declare_tensor(
var_name: str,
dtype: str,
storage_type: str,
is_scalar_array: bool = False,
precision: str = "PRECISION",
) -> str:
assert storage_type.lower() in ["buffer", "texture3d", "texture2d"]
@@ -242,7 +243,12 @@ def layout_declare_tensor(
# Create buffer binding
if storage_type.lower() == "buffer":
return layout_declare_buffer(
slot, access_type, var_name, dtype, precision, is_scalar_array=False
slot,
access_type,
var_name,
dtype,
precision,
is_scalar_array=is_scalar_array,
)

# Create image/sampler binding
@@ -533,7 +539,7 @@ def generateVariantCombinations(
curr_suffix = (
suffix + "_" + str(i) if suffix else str(i)
)
param_values.append((param_name, curr_suffix, str(i)))
param_values.append((param_name, curr_suffix, i))
else:
raise ValueError(
f"{value['RANGE']} is not a valid range. Must be in format [start, end] (inclusive)."
@@ -595,7 +601,7 @@ def parseTemplateYaml(self, yaml_file: str) -> None:
variant_name = variant["NAME"]
for param_value in combination:
default_params_copy[param_value[0]] = param_value[2]
if len(param_value[1]) > 0:
if len(str(param_value[1])) > 0:
variant_name = f"{variant_name}_{param_value[1]}"

default_params_copy["NAME"] = variant_name
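The net effect of this change is that RANGE parameter values now flow through as integers, while the variant-name suffix stays a string (hence the len(str(...)) guard). Below is a minimal Python sketch of that expansion; expand_range is a hypothetical stand-in for the generator's RANGE handling, not an actual function in gen_vulkan_spv.py.

# Hypothetical helper illustrating the RANGE expansion after this change:
# the third tuple element stays an int, the suffix stays a string.
def expand_range(param_name: str, start: int, end: int, suffix: str = "") -> list:
    param_values = []
    for i in range(start, end + 1):  # "[start, end] (inclusive)", per the error message
        curr_suffix = suffix + "_" + str(i) if suffix else str(i)
        param_values.append((param_name, curr_suffix, i))  # previously str(i)
    return param_values

# Example: RANGE: [1, 3] for a parameter NREG yields
# [("NREG", "1", 1), ("NREG", "2", 2), ("NREG", "3", 3)], and variant names
# get "_1", "_2", "_3" appended because len(str(curr_suffix)) > 0.
print(expand_range("NREG", 1, 3))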
64 changes: 64 additions & 0 deletions backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl
@@ -0,0 +1,64 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

layout(std430) buffer;

$if MEMTYPE == "ubo":
${layout_declare_ubo(0, "vec4", "A")}
$elif MEMTYPE == "buffer":
${layout_declare_buffer(0, "r", "A", DTYPE, "PRECISION", False)}
$else:
${layout_declare_buffer(0, "r", "_", DTYPE, "PRECISION", False)}

${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int niter = 1;
layout(constant_id = 4) const int nvec = 1;
layout(constant_id = 5) const int local_group_size = 1;

$if MEMTYPE == "shared":
shared vec4 A[nvec];

void main() {

$if MEMTYPE == "shared":
A[gl_LocalInvocationID[0]][0] = gl_LocalInvocationID[0];
memoryBarrierShared();

// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
// This limits accesses to a specific set of unique addresses, depending on
// the access size we want to measure.
const int addr_mask = nvec - 1;
vec4 sum = vec4(0);

// This distributes the accesses to unique addresses across the workgroups once
// the access size exceeds the workgroup width.
const uint workgroup_width = local_group_size * niter * ${NUNROLL};
uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask;

int i = 0;
for (; i < niter; ++i){
$for j in range(int(NUNROLL)):
sum *= A[offset];

// On each unroll, a new unique address will be accessed through the offset,
// limited by the address mask to a specific set of unique addresses.
offset = (offset + local_group_size) & addr_mask;
}

// This is to ensure no compiler optimizations occur
vec4 zero = vec4(i>>31);

B[gl_LocalInvocationID[0]] = sum + zero;
}
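The offset arithmetic above relies on the power-of-two identity mentioned in the comment: masking with nvec - 1 is equivalent to taking the value modulo nvec. Here is a small Python check of that access pattern, with made-up values for nvec and local_group_size (the real ones come from the shader's specialization constants):

# nvec must be a power of two for addr_mask = nvec - 1 to act as a modulo.
nvec = 8
addr_mask = nvec - 1
local_group_size = 3  # made-up stride standing in for the specialization constant

offset = 0
seen = []
for i in range(16):  # a few "unrolled" iterations
    seen.append(offset)
    offset = (offset + local_group_size) & addr_mask

# The masked walk is the same as a modular walk over nvec unique addresses.
assert seen == [(i * local_group_size) % nvec for i in range(16)]
print(seen)  # only addresses 0..nvec-1 ever appear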
18 changes: 18 additions & 0 deletions backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.yaml
@@ -0,0 +1,18 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

buf_bandwidth:
parameter_names_with_default_values:
DTYPE: float
STORAGE: buffer
NUNROLL: "16"
generate_variant_forall:
MEMTYPE:
- VALUE: ubo
- VALUE: buffer
- VALUE: shared
shader_variants:
- NAME: buf_bandwidth
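The three MEMTYPE values produce three shader variants, which app.cpp (below) looks up by lowercasing the memory-type label and prefixing it with the template name. A tiny Python sketch of that mapping, assuming the variant suffix is simply the lowercased MEMTYPE value, as the C++ lookup implies:

# Which declaration each MEMTYPE variant selects in buf_bandwidth.glsl, and the
# shader name app.cpp requests ("buf_bandwidth_" + lowercased memtype).
variants = {
    "ubo": 'layout_declare_ubo(0, "vec4", "A")',
    "buffer": 'layout_declare_buffer(0, "r", "A", DTYPE, ...)',
    "shared": "shared vec4 A[nvec] (the bound source buffer is unused)",
}
for memtype, decl in variants.items():
    print(f"buf_bandwidth_{memtype}: {decl}")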
121 changes: 112 additions & 9 deletions backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -7,7 +7,6 @@
*/

#include <executorch/backends/vulkan/runtime/api/api.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
#include <iostream>

#include "stats.h"
@@ -18,6 +17,7 @@ using namespace vkapi;
class App {
private:
size_t buf_cache_size_;
uint32_t max_shared_mem_size_;
uint32_t sm_count_;
uint32_t nthread_logic_;

@@ -33,11 +33,12 @@ class App {
sm_count_ = cl_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();

max_shared_mem_size_ = cl_device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
std::cout << std::endl;
std::cout << "SM count," << sm_count_ << std::endl;
std::cout << "Logic Thread Count," << nthread_logic_ << std::endl;
std::cout << "Cache Size," << buf_cache_size_ << std::endl;
std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl;
}

void reg_count() {
@@ -58,9 +59,7 @@ class App {
uint32_t NITER;

auto bench = [&](uint32_t ngrp, uint32_t nreg) {
size_t len = sizeof(float);
StorageBuffer buffer(context(), vkapi::kFloat, len);
ParamsBuffer params(context(), int32_t(len));
StorageBuffer buffer(context(), vkapi::kFloat, 1);
vkapi::PipelineBarrier pipeline_barrier{};

auto shader_name = "reg_count_" + std::to_string(nreg);
@@ -74,8 +73,7 @@ class App {
{SV(NITER)},
VK_NULL_HANDLE,
0,
buffer.buffer(),
params.buffer());
buffer.buffer());
});
return time;
};
@@ -167,9 +165,8 @@ class App {
uint32_t NITER;

auto bench = [&](int stride) {
size_t len = sizeof(float);
StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE);
StorageBuffer out_buf(context(), vkapi::kFloat, len);
StorageBuffer out_buf(context(), vkapi::kFloat, 1);
vkapi::PipelineBarrier pipeline_barrier{};

auto shader_name = "buf_cacheline_size";
@@ -213,6 +210,109 @@ class App {

std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl;
}

private:
void _bandwidth(std::string memtype, uint32_t range) {
// TODO: Make these values configurable
// Cache lines flushed
const uint32_t NFLUSH = 4;
// Number of loop unrolls. Changing this value requires an equal change in
// buf_bandwidth.yaml
const uint32_t NUNROLL = 16;
// Number of iterations. Increasing this value reduces noise in exchange for
// higher latency.
const uint32_t NITER = 10;
// Vector dimensions (vec4)
const uint32_t VEC_WIDTH = 4;
const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
// Number of vectors that fit in the selected memory space
const uint32_t NVEC = range / VEC_SIZE;
// Number of memory reads per thread
const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
// Number of threads needed to read all vectors
// The thread count is not divided by the per-thread workload in shared memory
// because of the limited memory size.
const uint32_t NTHREAD =
memtype == "Shared" ? NVEC : NVEC / NREAD_PER_THREAD;
// Occupy all threads
const uint32_t local_x = nthread_logic_;
// Ensure that global is a multiple of local, and distribute across all SMs
const uint32_t global_x =
(NTHREAD / local_x * local_x) * sm_count_ * NFLUSH;

auto bench = [&](uint32_t access_size) {
// Number of vectors that fit in this iteration
const uint32_t nvec_access = access_size / VEC_SIZE;

StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float));
StorageBuffer out_buf(
context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_);
vkapi::PipelineBarrier pipeline_barrier{};

auto memtype_lower = memtype;
std::transform(
memtype_lower.begin(),
memtype_lower.end(),
memtype_lower.begin(),
[](unsigned char c) { return std::tolower(c); });
auto shader_name = "buf_bandwidth_" + memtype_lower;

auto time = benchmark_on_gpu(shader_name, 10, [&]() {
context()->submit_compute_job(
VK_KERNEL_FROM_STR(shader_name),
pipeline_barrier,
{global_x, 1, 1},
{local_x, 1, 1},
{SV(NITER), SV(nvec_access), SV(local_x)},
VK_NULL_HANDLE,
0,
in_buf.buffer(),
out_buf.buffer());
});

const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
auto gbps = SIZE_TRANS * 1e-3 / time;
std::cout << memtype << " bandwidth accessing \t" << access_size
<< "\tB unique data is \t" << gbps << " \tgbps (\t" << time
<< "\tus)" << std::endl;
return gbps;
};

double max_bandwidth = 0;
double min_bandwidth = DBL_MAX;
for (uint32_t access_size = VEC_SIZE; access_size < range;
access_size *= 2) {
double gbps = bench(access_size);
max_bandwidth = std::max(gbps, max_bandwidth);
min_bandwidth = std::min(gbps, min_bandwidth);
}

std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth
<< std::endl;
std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth
<< std::endl;
}

public:
void buf_bandwidth() {
std::cout << "\n------ Memory Bandwidth ------" << std::endl;
// Maximum memory space read - 128MB
// For regular devices, bandwidth plateaus at less memory than this, so more
// is not needed.
const uint32_t RANGE = 128 * 1024 * 1024;
_bandwidth("Buffer", RANGE);
}

void ubo_bandwidth() {
std::cout << "\n------ UBO Bandwidth ------" << std::endl;
const uint32_t RANGE = 128 * 1024 * 1024;
_bandwidth("UBO", RANGE);
}
void shared_mem_bandwidth() {
std::cout << "\n------ Shared Bandwidth ------" << std::endl;
const uint32_t RANGE = max_shared_mem_size_;
_bandwidth("Shared", RANGE);
}
};

int main(int argc, const char** argv) {
@@ -221,6 +321,9 @@ int main(int argc, const char** argv) {
// TODO: Allow user to skip tests
app.reg_count();
app.buf_cacheline_size();
app.buf_bandwidth();
app.ubo_bandwidth();
app.shared_mem_bandwidth();

return 0;
}
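For a rough sense of the numbers _bandwidth() works with, here is a back-of-the-envelope Python version of its sizing math for the buffer case, with hypothetical values for nthread_logic_, sm_count_, and the kernel time (the real ones are queried from OpenCL and measured at runtime):

# Constants copied from _bandwidth(); device values below are made up.
NFLUSH, NUNROLL, NITER = 4, 16, 10
VEC_WIDTH = 4
VEC_SIZE = VEC_WIDTH * 4            # vec4 of 32-bit floats = 16 bytes
RANGE = 128 * 1024 * 1024           # 128 MB ceiling used for the buffer/UBO runs

nthread_logic, sm_count = 1024, 16  # hypothetical device limits
NVEC = RANGE // VEC_SIZE            # vectors that fit in the range
NREAD_PER_THREAD = NUNROLL * NITER  # reads issued by each thread
NTHREAD = NVEC // NREAD_PER_THREAD  # buffer/UBO case ("Shared" uses NVEC directly)
local_x = nthread_logic
global_x = (NTHREAD // local_x * local_x) * sm_count * NFLUSH

SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE  # bytes moved per dispatch
time_us = 50000.0                                    # made-up kernel time in microseconds
gbps = SIZE_TRANS * 1e-3 / time_us                   # kB / us is numerically GB/s
print(NVEC, NTHREAD, global_x, SIZE_TRANS, round(gbps, 1))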
