[llama][8/N] Add Multimodal Runner
ghstack-source-id: d72da351e7eb7ca69eeaf9241bc604aeaf3bc139
Pull Request resolved: #4354
larryliu0820 committed Jul 31, 2024
1 parent d0e65a8 commit 526a461
Showing 9 changed files with 1,153 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -130,6 +130,10 @@ if(EXECUTORCH_ENABLE_EVENT_TRACER)
add_definitions(-DET_EVENT_TRACER_ENABLED)
endif()

option(EXECUTORCH_DO_NOT_USE_CXX11_ABI "Define _GLIBCXX_USE_CXX11_ABI=0 if ON" OFF)
if(EXECUTORCH_DO_NOT_USE_CXX11_ABI)
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
endif()
# -ffunction-sections -fdata-sections: breaks function and data into sections so
# they can be properly gc'd. -s: strip symbol. -fno-exceptions -fno-rtti:
# disables exceptions and runtime type.
234 changes: 234 additions & 0 deletions examples/models/llava/CMakeLists.txt
@@ -0,0 +1,234 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

#
# Simple CMake build system for multimodal runner.
#
# ### Editing this file ###
#
# This file should be formatted with
# ~~~
# cmake-format -i CMakeLists.txt
# ~~~
# It should also be cmake-lint clean.
#
cmake_minimum_required(VERSION 3.19)
project(multimodal)

# Duplicate options from the root CMakeLists.txt
option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF)

option(EXECUTORCH_USE_TIKTOKEN "Use Tiktoken as a tokenizer" OFF)

include(CMakeDependentOption)
#
# pthreadpool: build pthreadpool library. Disable on unsupported platforms
#
cmake_dependent_option(
EXECUTORCH_BUILD_PTHREADPOOL "Build pthreadpool library." ON
"NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF
)
#
# cpuinfo: build cpuinfo library. Disable on unsupported platforms
#
cmake_dependent_option(
EXECUTORCH_BUILD_CPUINFO "Build cpuinfo library." ON
"NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF
)

if(NOT PYTHON_EXECUTABLE)
set(PYTHON_EXECUTABLE python3)
endif()

set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch)

include(${EXECUTORCH_ROOT}/build/Utils.cmake)

if(NOT PYTHON_EXECUTABLE)
resolve_python_executable()
endif()

if(NOT CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 17)
# Can't set to 11 due to executor_runner.cpp make_unique
endif()

if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
set(CMAKE_TOOLCHAIN_IOS ON)
else()
set(CMAKE_TOOLCHAIN_IOS OFF)
endif()

set(_common_compile_options -Wno-deprecated-declarations -fPIC)

# Let files say "include <executorch/path/to/header.h>".
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

# For some reason the Android build is not able to find where gflags is
# installed and hence cannot find the corresponding .cmake file
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
find_package(gflags REQUIRED)

find_package(Torch CONFIG REQUIRED)
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)

#
# multimodal_main: test binary to run the multimodal model, with tokenizer and
# sampler integrated
#

# Find `executorch` libraries; same approach as for gflags
set(executorch_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../lib/cmake/ExecuTorch)
find_package(executorch CONFIG REQUIRED)
if(CMAKE_TOOLCHAIN_IOS OR ANDROID)
target_link_options_shared_lib(executorch)
endif()

# custom ops library
if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../llama2/custom_ops custom_ops)
endif()

# multimodal_runner library
add_subdirectory(runner)
if(EXECUTORCH_USE_TIKTOKEN)
# find RE2 for tokenizer
set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/abseil-cpp
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/re2
${CMAKE_CURRENT_BINARY_DIR}/re2
)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
target_link_libraries(multimodal_runner PUBLIC re2::re2)
endif()

set(link_libraries gflags torch)
set(_srcs main.cpp)

if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
list(
APPEND
link_libraries
optimized_native_cpu_ops_lib
optimized_kernels
portable_kernels
cpublas
eigen_blas
)
target_link_options_shared_lib(optimized_native_cpu_ops_lib)
else()
list(APPEND link_libraries portable_ops_lib portable_kernels)
target_link_options_shared_lib(portable_ops_lib)
endif()

# quantized_ops_lib: Register quantized op kernels into the runtime
target_link_options_shared_lib(quantized_ops_lib)
list(APPEND link_libraries quantized_kernels quantized_ops_lib)

if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
target_link_options_shared_lib(custom_ops)
list(APPEND link_libraries custom_ops)
endif()

set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack)
# Extra compile option and include dir for pthreadpool
if(EXECUTORCH_BUILD_PTHREADPOOL)
list(APPEND _common_compile_options -DET_USE_THREADPOOL)
list(APPEND link_libraries pthreadpool)
# These 2 source files are included in xnnpack_backend
if(NOT TARGET xnnpack_backend)
list(APPEND _srcs ${XNNPACK_ROOT}/threadpool/threadpool.cpp
${XNNPACK_ROOT}/threadpool/threadpool_guard.cpp
)
endif()
list(APPEND _common_include_directories
${XNNPACK_ROOT}/third-party/pthreadpool/include
)
endif()

# Extra sources for cpuinfo
if(EXECUTORCH_BUILD_CPUINFO)
list(APPEND link_libraries cpuinfo)
list(APPEND _srcs ${XNNPACK_ROOT}/threadpool/cpuinfo_utils.cpp)
list(APPEND _common_include_directories
${XNNPACK_ROOT}/third-party/cpuinfo/include
)
endif()

# XNNPACK
if(TARGET xnnpack_backend)
set(xnnpack_backend_libs xnnpack_backend XNNPACK)
list(APPEND link_libraries ${xnnpack_backend_libs})
target_link_options_shared_lib(xnnpack_backend)
endif()

# Vulkan backend
if(TARGET vulkan_backend)
list(APPEND link_libraries vulkan_backend)
target_link_options_shared_lib(vulkan_backend)
endif()

# Qnn backend
if(TARGET qnn_executorch_backend)
list(APPEND link_libraries qnn_executorch_backend)
target_link_options_shared_lib(qnn_executorch_backend)
endif()

# MPS backend
if(TARGET mpsdelegate)
list(
APPEND
link_libraries
mpsdelegate
"-framework Foundation"
"-weak_framework MetalPerformanceShaders"
"-weak_framework MetalPerformanceShadersGraph"
"-weak_framework Metal"
)
target_link_options_shared_lib(mpsdelegate)
endif()

if(TARGET coremldelegate)
find_library(SQLITE_LIBRARY sqlite3)
list(
APPEND
link_libraries
coremldelegate
sqlite3
"-framework Foundation"
"-framework CoreML"
"-framework Accelerate"
)
target_link_options_shared_lib(coremldelegate)
endif()

# Needed by cpuinfo, which uses the Android-specific log library
if(ANDROID)
list(APPEND link_libraries log)
endif()

add_executable(multimodal_main ${_srcs})
if(CMAKE_BUILD_TYPE STREQUAL "Release")
target_link_options(multimodal_main PRIVATE "LINKER:--gc-sections,-s")
endif()

target_include_directories(multimodal_main PUBLIC ${_common_include_directories})
target_link_libraries(multimodal_main PUBLIC multimodal_runner ${link_libraries})
target_compile_options(multimodal_main PUBLIC ${_common_compile_options})

if(APPLE)
target_link_options_shared_lib(executorch)
endif()

# Print the configuration summary
executorch_print_configuration_summary()
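
A note on the repeated `target_link_options_shared_lib(...)` calls above: the helper comes from `build/Utils.cmake` (included near the top of this file) and is applied to every ops/backend library (portable_ops_lib, quantized_ops_lib, custom_ops, xnnpack_backend, ...) so that their statically registered kernels and backends are not discarded by the linker. The snippet below is only a rough sketch of the kind of whole-archive linking such a helper performs; the function name is hypothetical and the real implementation in `build/Utils.cmake` may differ.

# Illustrative sketch only -- not the actual helper from build/Utils.cmake.
# Forces the whole static archive to be linked so its registration objects
# (kernel/backend registrations that run at static-init time) are kept.
function(force_load_whole_archive target)
  if(APPLE)
    # ld64: keep every object file from this archive.
    target_link_options(${target} INTERFACE
      "SHELL:-Wl,-force_load,$<TARGET_FILE:${target}>")
  else()
    # GNU ld / lld: wrap the archive in --whole-archive / --no-whole-archive.
    target_link_options(${target} INTERFACE
      "SHELL:-Wl,--whole-archive $<TARGET_FILE:${target}> -Wl,--no-whole-archive")
  endif()
endfunction()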
37 changes: 37 additions & 0 deletions examples/models/llava/build.sh
@@ -0,0 +1,37 @@
cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Debug \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_XNNPACK=ON \
-DEXECUTORCH_DO_NOT_USE_CXX11_ABI=ON \
-Bcmake-out .


cmake --build cmake-out -j9 --target install --config Debug

dir=examples/models/llava
python_lib=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')

cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Debug \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_XNNPACK=ON \
-DCMAKE_PREFIX_PATH="$python_lib" \
-Bcmake-out/${dir} \
${dir}


cmake --build cmake-out/${dir} -j9 --config Debug

# cmake-out/examples/models/llava/multimodal_main \
# --tokenizer_path /data/users/larryliu/llava/tokenizer.bin \
# --model_path /data/users/larryliu/llava/llava_combined_xnnpack.pte \
# --prompt "\nWhat are the things I should be cautious about when I visit here?" \
# --image_path /data/users/larryliu/llava/image.pt \
# --temperature 0
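
For reference, the commented-out command at the end of the script corresponds to an invocation like the following, with the user-specific paths replaced by placeholders (the flag names match the gflags definitions in main.cpp below):

# Replace the path/to placeholders with your own artifact locations.
cmake-out/examples/models/llava/multimodal_main \
  --tokenizer_path path/to/tokenizer.bin \
  --model_path path/to/llava_combined_xnnpack.pte \
  --prompt "\nWhat are the things I should be cautious about when I visit here?" \
  --image_path path/to/image.pt \
  --temperature 0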
109 changes: 109 additions & 0 deletions examples/models/llava/main.cpp
@@ -0,0 +1,109 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/examples/models/llava/runner/multimodal_runner.h>
#include <gflags/gflags.h>
#include <torch/torch.h>

#if defined(ET_USE_THREADPOOL)
#include <executorch/backends/xnnpack/threadpool/cpuinfo_utils.h>
#include <executorch/backends/xnnpack/threadpool/threadpool.h>
#endif

DEFINE_string(
model_path,
"llava.pte",
"Model serialized in flatbuffer format.");

DEFINE_string(tokenizer_path, "tokenizer.bin", "Path to the tokenizer file.");

DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");

DEFINE_string(
image_path,
"",
"The path to a .pt file, a serialized torch tensor for an image, longest edge resized to 336.");

DEFINE_double(
temperature,
0.8f,
"Temperature; Default is 0.8f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic");

DEFINE_int32(
seq_len,
1024,
"Total number of tokens to generate (prompt + output). Defaults to max_seq_len. If the number of input tokens + seq_len > max_seq_len, the output will be truncated to max_seq_len tokens.");

DEFINE_int32(
cpu_threads,
-1,
"Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");

int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);

// Create a loader to get the data of the program file. There are other
// DataLoaders that use mmap() or point to data that's already in memory,
// and users can create their own DataLoaders to load from arbitrary sources.
const char* model_path = FLAGS_model_path.c_str();

const char* tokenizer_path = FLAGS_tokenizer_path.c_str();

const char* prompt = FLAGS_prompt.c_str();

std::string image_path = FLAGS_image_path;

double temperature = FLAGS_temperature;

int32_t seq_len = FLAGS_seq_len;

int32_t cpu_threads = FLAGS_cpu_threads;

#if defined(ET_USE_THREADPOOL)
uint32_t num_performant_cores = cpu_threads == -1
? torch::executorch::cpuinfo::get_num_performant_cores()
: static_cast<uint32_t>(cpu_threads);
ET_LOG(
Info, "Resetting threadpool with num threads = %d", num_performant_cores);
if (num_performant_cores > 0) {
torch::executorch::threadpool::get_threadpool()->_unsafe_reset_threadpool(
num_performant_cores);
}
#endif
// create multimodal runner
torch::executor::MultiModalRunner runner(
model_path, tokenizer_path, temperature);

// read image and resize the longest edge to 336
std::vector<uint8_t> image_data;
// cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR);
// int longest_edge = std::max(image.rows, image.cols);
// float scale_factor = 336.0f / longest_edge;
// cv::Size new_size(image.cols * scale_factor, image.rows * scale_factor);
// cv::Mat resized_image;
// cv::resize(image, resized_image, new_size);
// image_data.assign(resized_image.datastart, resized_image.dataend);
torch::Tensor image_tensor;
torch::load(image_tensor, image_path); // CHW
ET_LOG(
Info,
"image size(0): %zu, size(1): %zu, size(2): %zu",
image_tensor.size(0),
image_tensor.size(1),
image_tensor.size(2));
image_data.assign(
image_tensor.data_ptr<uint8_t>(),
image_tensor.data_ptr<uint8_t>() + image_tensor.numel());
torch::executor::Image image{
.data = image_data,
.width = static_cast<int32_t>(image_tensor.size(2)),
.height = static_cast<int32_t>(image_tensor.size(1))};
// generate
runner.generate({image}, prompt, seq_len);
return 0;
}
(The remaining 5 changed files are not shown.)
