[llama][8/N] Add Multimodal Runner
ghstack-source-id: d72da351e7eb7ca69eeaf9241bc604aeaf3bc139
Pull Request resolved: #4354
larryliu0820 committed Jul 31, 2024
1 parent d0e65a8 commit 526a461
Showing 9 changed files with 1,153 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -130,6 +130,10 @@ if(EXECUTORCH_ENABLE_EVENT_TRACER)
add_definitions(-DET_EVENT_TRACER_ENABLED)
endif()

option(EXECUTORCH_DO_NOT_USE_CXX11_ABI "Define _GLIBCXX_USE_CXX11_ABI=0 if ON" OFF)
if(EXECUTORCH_DO_NOT_USE_CXX11_ABI)
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
endif()
# -ffunction-sections -fdata-sections: breaks function and data into sections so
# they can be properly gc'd. -s: strip symbol. -fno-exceptions -fno-rtti:
# disables exceptions and runtime type.
234 changes: 234 additions & 0 deletions examples/models/llava/CMakeLists.txt
@@ -0,0 +1,234 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

#
# Simple CMake build system for multimodal runner.
#
# ### Editing this file ###
#
# This file should be formatted with
# ~~~
# cmake-format -i CMakeLists.txt
# ~~~
# It should also be cmake-lint clean.
#
cmake_minimum_required(VERSION 3.19)
project(multimodal)

# Duplicate options from the root CMakeLists.txt
option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF)

option(EXECUTORCH_USE_TIKTOKEN "Use Tiktoken as a tokenizer" OFF)

include(CMakeDependentOption)
#
# pthreadpool: build pthreadpool library. Disable on unsupported platforms
#
cmake_dependent_option(
EXECUTORCH_BUILD_PTHREADPOOL "Build pthreadpool library." ON
"NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF
)
#
# cpuinfo: build cpuinfo library. Disable on unsupported platforms
#
cmake_dependent_option(
EXECUTORCH_BUILD_CPUINFO "Build cpuinfo library." ON
"NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF
)

if(NOT PYTHON_EXECUTABLE)
set(PYTHON_EXECUTABLE python3)
endif()

set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch)

include(${EXECUTORCH_ROOT}/build/Utils.cmake)

if(NOT PYTHON_EXECUTABLE)
resolve_python_executable()
endif()

if(NOT CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 17)
# Can't set to 11 due to executor_runner.cpp make_unique
endif()

if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
set(CMAKE_TOOLCHAIN_IOS ON)
else()
set(CMAKE_TOOLCHAIN_IOS OFF)
endif()

set(_common_compile_options -Wno-deprecated-declarations -fPIC)

# Let files say "include <executorch/path/to/header.h>".
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

# For some reason the Android build is not able to find where gflags is
# installed and hence cannot find the corresponding .cmake file
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
find_package(gflags REQUIRED)

find_package(Torch CONFIG REQUIRED)
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)

#
# multimodal_main: test binary to run the multimodal model, with tokenizer and
# sampler integrated
#

# Find `executorch` libraries; same approach as for gflags
set(executorch_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../lib/cmake/ExecuTorch)
find_package(executorch CONFIG REQUIRED)
if(CMAKE_TOOLCHAIN_IOS OR ANDROID)
target_link_options_shared_lib(executorch)
endif()

# custom ops library
if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../llama2/custom_ops custom_ops)
endif()

# multimodal_runner library
add_subdirectory(runner)
if(EXECUTORCH_USE_TIKTOKEN)
# find RE2 for tokenizer
set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/abseil-cpp
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/re2
${CMAKE_CURRENT_BINARY_DIR}/re2
)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
target_link_libraries(multimodal_runner PUBLIC re2::re2)
endif()

set(link_libraries gflags torch)
set(_srcs main.cpp)

if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
list(
APPEND
link_libraries
optimized_native_cpu_ops_lib
optimized_kernels
portable_kernels
cpublas
eigen_blas
)
target_link_options_shared_lib(optimized_native_cpu_ops_lib)
else()
list(APPEND link_libraries portable_ops_lib portable_kernels)
target_link_options_shared_lib(portable_ops_lib)
endif()

# quantized_ops_lib: Register quantized op kernels into the runtime
target_link_options_shared_lib(quantized_ops_lib)
list(APPEND link_libraries quantized_kernels quantized_ops_lib)

if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
target_link_options_shared_lib(custom_ops)
list(APPEND link_libraries custom_ops)
endif()

set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack)
# Extra compile option and include dir for pthreadpool
if(EXECUTORCH_BUILD_PTHREADPOOL)
list(APPEND _common_compile_options -DET_USE_THREADPOOL)
list(APPEND link_libraries pthreadpool)
# These 2 source files are included in xnnpack_backend
if(NOT TARGET xnnpack_backend)
list(APPEND _srcs ${XNNPACK_ROOT}/threadpool/threadpool.cpp
${XNNPACK_ROOT}/threadpool/threadpool_guard.cpp
)
endif()
list(APPEND _common_include_directories
${XNNPACK_ROOT}/third-party/pthreadpool/include
)
endif()

# Extra sources for cpuinfo
if(EXECUTORCH_BUILD_CPUINFO)
list(APPEND link_libraries cpuinfo)
list(APPEND _srcs ${XNNPACK_ROOT}/threadpool/cpuinfo_utils.cpp)
list(APPEND _common_include_directories
${XNNPACK_ROOT}/third-party/cpuinfo/include
)
endif()

# XNNPACK
if(TARGET xnnpack_backend)
set(xnnpack_backend_libs xnnpack_backend XNNPACK)
list(APPEND link_libraries ${xnnpack_backend_libs})
target_link_options_shared_lib(xnnpack_backend)
endif()

# Vulkan backend
if(TARGET vulkan_backend)
list(APPEND link_libraries vulkan_backend)
target_link_options_shared_lib(vulkan_backend)
endif()

# Qnn backend
if(TARGET qnn_executorch_backend)
list(APPEND link_libraries qnn_executorch_backend)
target_link_options_shared_lib(qnn_executorch_backend)
endif()

# MPS backend
if(TARGET mpsdelegate)
list(
APPEND
link_libraries
mpsdelegate
"-framework Foundation"
"-weak_framework MetalPerformanceShaders"
"-weak_framework MetalPerformanceShadersGraph"
"-weak_framework Metal"
)
target_link_options_shared_lib(mpsdelegate)
endif()

if(TARGET coremldelegate)
find_library(SQLITE_LIBRARY sqlite3)
list(
APPEND
link_libraries
coremldelegate
sqlite3
"-framework Foundation"
"-framework CoreML"
"-framework Accelerate"
)
target_link_options_shared_lib(coremldelegate)
endif()

# Needed by cpuinfo, which uses the Android-specific log library
if(ANDROID)
list(APPEND link_libraries log)
endif()

add_executable(multimodal_main ${_srcs})
if(CMAKE_BUILD_TYPE STREQUAL "Release")
target_link_options(multimodal_main PRIVATE "LINKER:--gc-sections,-s")
endif()

target_include_directories(multimodal_main PUBLIC ${_common_include_directories})
target_link_libraries(multimodal_main PUBLIC multimodal_runner ${link_libraries})
target_compile_options(multimodal_main PUBLIC ${_common_compile_options})

if(APPLE)
target_link_options_shared_lib(executorch)
endif()

# Print the configuration summary
executorch_print_configuration_summary()
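
A note on the repeated `target_link_options_shared_lib(...)` calls above: the helper comes from `build/Utils.cmake` (included near the top of this file) and is applied to every ops/backend library (portable_ops_lib, quantized_ops_lib, custom_ops, xnnpack_backend, ...) so that their statically registered kernels and backends are not discarded by the linker. The snippet below is only a rough sketch of the kind of whole-archive linking such a helper performs; the function name is hypothetical and the real implementation in `build/Utils.cmake` may differ.

# Illustrative sketch only -- not the actual helper from build/Utils.cmake.
# Forces the whole static archive to be linked so its registration objects
# (kernel/backend registrations that run at static-init time) are kept.
function(force_load_whole_archive target)
  if(APPLE)
    # ld64: keep every object file from this archive.
    target_link_options(${target} INTERFACE
      "SHELL:-Wl,-force_load,$<TARGET_FILE:${target}>")
  else()
    # GNU ld / lld: wrap the archive in --whole-archive / --no-whole-archive.
    target_link_options(${target} INTERFACE
      "SHELL:-Wl,--whole-archive $<TARGET_FILE:${target}> -Wl,--no-whole-archive")
  endif()
endfunction()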
37 changes: 37 additions & 0 deletions examples/models/llava/build.sh
@@ -0,0 +1,37 @@
cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Debug \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_XNNPACK=ON \
-DEXECUTORCH_DO_NOT_USE_CXX11_ABI=ON \
-Bcmake-out .


cmake --build cmake-out -j9 --target install --config Debug

dir=examples/models/llava
python_lib=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')

cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Debug \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_XNNPACK=ON \
-DCMAKE_PREFIX_PATH="$python_lib" \
-Bcmake-out/${dir} \
${dir}


cmake --build cmake-out/${dir} -j9 --config Debug

# cmake-out/examples/models/llava/multimodal_main \
# --tokenizer_path /data/users/larryliu/llava/tokenizer.bin \
# --model_path /data/users/larryliu/llava/llava_combined_xnnpack.pte \
# --prompt "\nWhat are the things I should be cautious about when I visit here?" \
# --image_path /data/users/larryliu/llava/image.pt \
# --temperature 0
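
For reference, the commented-out command at the end of the script corresponds to an invocation like the following, with the user-specific paths replaced by placeholders (the flag names match the gflags definitions in main.cpp below):

# Replace the path/to placeholders with your own artifact locations.
cmake-out/examples/models/llava/multimodal_main \
  --tokenizer_path path/to/tokenizer.bin \
  --model_path path/to/llava_combined_xnnpack.pte \
  --prompt "\nWhat are the things I should be cautious about when I visit here?" \
  --image_path path/to/image.pt \
  --temperature 0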
109 changes: 109 additions & 0 deletions examples/models/llava/main.cpp
@@ -0,0 +1,109 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/examples/models/llava/runner/multimodal_runner.h>
#include <gflags/gflags.h>
#include <torch/torch.h>

#if defined(ET_USE_THREADPOOL)
#include <executorch/backends/xnnpack/threadpool/cpuinfo_utils.h>
#include <executorch/backends/xnnpack/threadpool/threadpool.h>
#endif

DEFINE_string(
model_path,
"llava.pte",
"Model serialized in flatbuffer format.");

DEFINE_string(tokenizer_path, "tokenizer.bin", "Path to the tokenizer file.");

DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");

DEFINE_string(
image_path,
"",
"The path to a .pt file, a serialized torch tensor for an image, longest edge resized to 336.");

DEFINE_double(
temperature,
0.8f,
"Temperature; Default is 0.8f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic");

DEFINE_int32(
seq_len,
1024,
"Total number of tokens to generate (prompt + output). Defaults to max_seq_len. If the number of input tokens + seq_len > max_seq_len, the output will be truncated to max_seq_len tokens.");

DEFINE_int32(
cpu_threads,
-1,
"Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");

int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);

// Create a loader to get the data of the program file. There are other
// DataLoaders that use mmap() or point to data that's already in memory,
// and users can create their own DataLoaders to load from arbitrary sources.
const char* model_path = FLAGS_model_path.c_str();

const char* tokenizer_path = FLAGS_tokenizer_path.c_str();

const char* prompt = FLAGS_prompt.c_str();

std::string image_path = FLAGS_image_path;

double temperature = FLAGS_temperature;

int32_t seq_len = FLAGS_seq_len;

int32_t cpu_threads = FLAGS_cpu_threads;

#if defined(ET_USE_THREADPOOL)
uint32_t num_performant_cores = cpu_threads == -1
? torch::executorch::cpuinfo::get_num_performant_cores()
: static_cast<uint32_t>(cpu_threads);
ET_LOG(
Info, "Resetting threadpool with num threads = %d", num_performant_cores);
if (num_performant_cores > 0) {
torch::executorch::threadpool::get_threadpool()->_unsafe_reset_threadpool(
num_performant_cores);
}
#endif
// create multimodal runner
torch::executor::MultiModalRunner runner(
model_path, tokenizer_path, temperature);

// read image and resize the longest edge to 336
std::vector<uint8_t> image_data;
// cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR);
// int longest_edge = std::max(image.rows, image.cols);
// float scale_factor = 336.0f / longest_edge;
// cv::Size new_size(image.cols * scale_factor, image.rows * scale_factor);
// cv::Mat resized_image;
// cv::resize(image, resized_image, new_size);
// image_data.assign(resized_image.datastart, resized_image.dataend);
torch::Tensor image_tensor;
torch::load(image_tensor, image_path); // CHW
ET_LOG(
Info,
"image size(0): %zu, size(1): %zu, size(2): %zu",
image_tensor.size(0),
image_tensor.size(1),
image_tensor.size(2));
image_data.assign(
image_tensor.data_ptr<uint8_t>(),
image_tensor.data_ptr<uint8_t>() + image_tensor.numel());
torch::executor::Image image{
.data = image_data,
.width = static_cast<int32_t>(image_tensor.size(2)),
.height = static_cast<int32_t>(image_tensor.size(1))};
// generate
runner.generate({image}, prompt, seq_len);
return 0;
}
(The remaining 5 changed files are not shown.)
