[SERVE][CPP][Android] add native executable program to benchmark models #2987

Open · wants to merge 4 commits into base: main
14 changes: 14 additions & 0 deletions CMakeLists.txt
@@ -47,6 +47,7 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 # tvm runtime config: minimize runtime components
 set(USE_RPC OFF)
 set(USE_MICRO OFF)
+# set(USE_VULKAN ON)
 set(USE_GRAPH_EXECUTOR OFF)
 set(USE_GRAPH_EXECUTOR_DEBUG OFF)
 set(USE_AOT_EXECUTOR OFF)
@@ -175,3 +176,16 @@ else()
     LIBRARY DESTINATION lib${LIB_SUFFIX}
   )
 endif()
+
+add_executable(llm_benchmark cpp/llm_benchmark.cpp)
+
+target_include_directories(llm_benchmark PRIVATE
+  ${TVM_SOURCE_DIR}/include
+  ${TVM_SOURCE_DIR}/3rdparty/dlpack/include
+  ${TVM_SOURCE_DIR}/3rdparty/dmlc-core/include
+  ${TVM_SOURCE_DIR}/3rdparty/picojson
+  ${TOKENZIER_CPP_PATH}/include
+)
+target_link_libraries(llm_benchmark PUBLIC mlc_llm_module)
+
+# target_link_libraries(tvm PRIVATE log)
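The new cpp/llm_benchmark.cpp target is registered above, but its source is not shown in this diff view. As orientation only, here is a minimal sketch of what a native benchmark driver along these lines could look like, built on the mlc.json_ffi.CreateJSONFFIEngine global and the vtable entries defined in the files below. The device type (4 = kDLOpenCL), the config and request JSON strings, and the fixed wait are illustrative assumptions, not the PR's actual benchmark code.

#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

#include <atomic>
#include <chrono>
#include <iostream>
#include <string>
#include <thread>

int main() {
  using namespace tvm::runtime;

  // Factory registered in cpp/json_ffi/json_ffi_engine.cc.
  const PackedFunc* create = Registry::Get("mlc.json_ffi.CreateJSONFFIEngine");
  if (create == nullptr) {
    std::cerr << "mlc.json_ffi.CreateJSONFFIEngine is not registered\n";
    return 1;
  }
  Module engine = (*create)();

  // Every streamed delta arrives here as a serialized JSON array of
  // ChatCompletionStreamResponse objects; count the callbacks as a crude
  // proxy for decode steps.
  std::atomic<int> num_chunks{0};
  PackedFunc stream_cb([&](TVMArgs args, TVMRetValue* ret) {
    std::string responses = args[0];
    ++num_chunks;
  });

  // 4 == kDLOpenCL, a plausible device type for Android; device id 0.
  engine.GetFunction("init_background_engine")(4, 0, stream_cb);
  std::thread decode([&]() { engine.GetFunction("run_background_loop")(); });
  std::thread stream([&]() { engine.GetFunction("run_background_stream_back_loop")(); });

  // Abridged engine config; the real JSON also carries model_lib, mode, etc.
  engine.GetFunction("reload")(std::string(R"({"model": "/data/local/tmp/model"})"));

  auto t0 = std::chrono::steady_clock::now();
  engine.GetFunction("chat_completion")(
      std::string(R"({"messages": [{"role": "user", "content": "Hello"}]})"),
      std::string("bench-0"));

  // Crude fixed wait; a real driver would watch the stream for the final
  // usage chunk of request "bench-0".
  std::this_thread::sleep_for(std::chrono::seconds(30));
  double secs =
      std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count();

  engine.GetFunction("exit_background_loop")();
  decode.join();
  stream.join();
  std::cout << num_chunks << " stream chunks in " << secs << " s" << std::endl;
  return 0;
}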
254 changes: 122 additions & 132 deletions cpp/json_ffi/json_ffi_engine.cc
@@ -150,150 +150,140 @@ void JSONFFIEngine::ExitBackgroundLoop() { this->engine_->ExitBackgroundLoop();
 
 JSONFFIEngine::~JSONFFIEngine() { this->ExitBackgroundLoop(); }
 
-class JSONFFIEngineImpl : public JSONFFIEngine, public ModuleNode {
- public:
-  TVM_MODULE_VTABLE_BEGIN("mlc.json_ffi");
-  TVM_MODULE_VTABLE_ENTRY("init_background_engine", &JSONFFIEngineImpl::InitBackgroundEngine);
-  TVM_MODULE_VTABLE_ENTRY("reload", &JSONFFIEngineImpl::Reload);
-  TVM_MODULE_VTABLE_ENTRY("unload", &JSONFFIEngineImpl::Unload);
-  TVM_MODULE_VTABLE_ENTRY("reset", &JSONFFIEngineImpl::Reset);
-  TVM_MODULE_VTABLE_ENTRY("chat_completion", &JSONFFIEngineImpl::ChatCompletion);
-  TVM_MODULE_VTABLE_ENTRY("abort", &JSONFFIEngineImpl::Abort);
-  TVM_MODULE_VTABLE_ENTRY("get_last_error", &JSONFFIEngineImpl::GetLastError);
-  TVM_MODULE_VTABLE_ENTRY("run_background_loop", &JSONFFIEngineImpl::RunBackgroundLoop);
-  TVM_MODULE_VTABLE_ENTRY("run_background_stream_back_loop",
-                          &JSONFFIEngineImpl::RunBackgroundStreamBackLoop);
-  TVM_MODULE_VTABLE_ENTRY("exit_background_loop", &JSONFFIEngineImpl::ExitBackgroundLoop);
-  TVM_MODULE_VTABLE_END();
-
-  void InitBackgroundEngine(int device_type, int device_id,
-                            Optional<PackedFunc> request_stream_callback) {
-    DLDevice device{static_cast<DLDeviceType>(device_type), device_id};
-    this->device_ = device;
-    CHECK(request_stream_callback.defined())
-        << "JSONFFIEngine requires request stream callback function, but it is not given.";
-    this->request_stream_callback_ = request_stream_callback.value();
-
-    auto frequest_stream_callback_wrapper = [this](TVMArgs args, TVMRetValue* ret) {
-      ICHECK_EQ(args.size(), 1);
-      Array<RequestStreamOutput> delta_outputs = args[0];
-      std::string responses = this->GetResponseFromStreamOutput(delta_outputs);
-      this->request_stream_callback_(responses);
-    };
-
-    request_stream_callback = PackedFunc(frequest_stream_callback_wrapper);
-    this->engine_->InitThreadedEngine(device, std::move(request_stream_callback), NullOpt);
-  }
-
-  void Reload(String engine_config_json_str) {
-    this->engine_->Reload(engine_config_json_str);
-    this->default_generation_config_ = this->engine_->GetDefaultGenerationConfig();
-    auto engine_config = this->engine_->GetCompleteEngineConfig();
-
-    // Load conversation template.
-    Result<picojson::object> model_config_json =
-        serve::Model::LoadModelConfig(engine_config->model);
-    CHECK(model_config_json.IsOk()) << model_config_json.UnwrapErr();
-    const picojson::object& model_config_json_unwrapped = model_config_json.Unwrap();
-    Result<Conversation> conv_template = Conversation::FromJSON(
-        json::Lookup<picojson::object>(model_config_json_unwrapped, "conv_template"));
-    CHECK(!conv_template.IsErr()) << "Invalid conversation template JSON: "
-                                  << conv_template.UnwrapErr();
-    this->conv_template_ = conv_template.Unwrap();
-    this->model_config_ = ModelConfig::FromJSON(
-        json::Lookup<picojson::object>(model_config_json_unwrapped, "model_config"));
-    this->tokenizer_ = Tokenizer::FromPath(engine_config->model);
-  }
-
-  void Unload() { this->engine_->Unload(); }
-
-  void Reset() { this->engine_->Reset(); }
-
-  void RunBackgroundLoop() { this->engine_->RunBackgroundLoop(); }
-
-  void RunBackgroundStreamBackLoop() { this->engine_->RunBackgroundStreamBackLoop(); }
-
-  String GetResponseFromStreamOutput(Array<RequestStreamOutput> delta_outputs) {
-    picojson::array json_response_arr;
-    for (const auto& delta_output : delta_outputs) {
-      std::string request_id = delta_output->request_id;
-      auto request_state_it = request_map_.find(request_id);
-      if (request_state_it == request_map_.end()) continue;
-      RequestState& rstate = request_state_it->second;
-
-      // build the final usage messages
-      // invariant, we can always let other messages to come first
-      // then the final usage messages, as final usage is always last
-      if (delta_output->request_final_usage_json_str.defined()) {
-        ChatCompletionStreamResponse response;
-        response.id = request_id;
-        response.model = rstate.model;
-        response.system_fingerprint = "";
-        std::string usage_json_str = delta_output->request_final_usage_json_str.value();
-        picojson::value usage_json;
-        std::string err = picojson::parse(usage_json, usage_json_str);
-        if (!err.empty()) {
-          err_ = err;
-        } else {
-          response.usage = usage_json;
-        }
-        json_response_arr.push_back(picojson::value(response.AsJSON()));
-        request_map_.erase(request_state_it);
-        continue;
-      }
-      ICHECK_NE(delta_output->group_finish_reason.size(), 0);
-      ICHECK_EQ(delta_output->group_delta_token_ids.size(),
-                delta_output->group_finish_reason.size());
-      ICHECK_EQ(delta_output->group_delta_token_ids.size(), rstate.streamer.size());
-
-      ChatCompletionStreamResponse response;
-      response.id = request_id;
-      response.model = rstate.model;
-      response.system_fingerprint = "";
-
-      for (size_t i = 0; i < delta_output->group_finish_reason.size(); ++i) {
-        // choice
-        ChatCompletionStreamResponseChoice choice;
-        Optional<String> finish_reason = delta_output->group_finish_reason[i];
-        if (finish_reason.defined()) {
-          if (finish_reason.value() == "stop") {
-            choice.finish_reason = FinishReason::stop;
-          } else if (finish_reason.value() == "length") {
-            choice.finish_reason = FinishReason::length;
-          } else if (finish_reason.value() == "tool_calls") {
-            choice.finish_reason = FinishReason::tool_calls;
-          } else if (finish_reason.value() == "error") {
-            choice.finish_reason = FinishReason::error;
-          }
-        } else {
-          choice.finish_reason = std::nullopt;
-        }
-        choice.index = static_cast<int>(i);
-        ChatCompletionMessage delta;
-        // Size of delta_output->group_delta_token_ids Array should be 1
-        const IntTuple& delta_token_ids = delta_output->group_delta_token_ids[i];
-        std::vector<int32_t> delta_token_ids_vec(delta_token_ids.begin(), delta_token_ids.end());
-        std::string content = rstate.streamer[i]->Put(delta_token_ids_vec);
-        if (finish_reason.defined()) {
-          content += rstate.streamer[i]->Finish();
-        }
-        if (!content.empty()) {
-          delta.content = content;
-        }
-        delta.role = "assistant";
-        choice.delta = delta;
-        if (!choice.delta.content.IsNull() || choice.finish_reason.has_value()) {
-          response.choices.push_back(choice);
-        }
-      }
-      // if it is not the usage block, choices cannot be empty
-      if (!response.choices.empty()) {
-        json_response_arr.push_back(picojson::value(response.AsJSON()));
-      }
-    }
-    return picojson::value(json_response_arr).serialize();
-  }
-};
+void JSONFFIEngineImpl::InitBackgroundEngine(int device_type, int device_id,
+                                             Optional<PackedFunc> request_stream_callback) {
+  DLDevice device{static_cast<DLDeviceType>(device_type), device_id};
+  this->device_ = device;
+  CHECK(request_stream_callback.defined())
+      << "JSONFFIEngine requires request stream callback function, but it is not given.";
+  this->request_stream_callback_ = request_stream_callback.value();
+
+  auto frequest_stream_callback_wrapper = [this](TVMArgs args, TVMRetValue* ret) {
+    ICHECK_EQ(args.size(), 1);
+    Array<RequestStreamOutput> delta_outputs = args[0];
+    std::string responses = this->GetResponseFromStreamOutput(delta_outputs);
+    this->request_stream_callback_(responses);
+  };
+
+  request_stream_callback = PackedFunc(frequest_stream_callback_wrapper);
+  this->engine_->InitThreadedEngine(device, std::move(request_stream_callback), NullOpt);
+}
+
+void JSONFFIEngineImpl::Reload(String engine_config_json_str) {
+  this->engine_->Reload(engine_config_json_str);
+  this->default_generation_config_ = this->engine_->GetDefaultGenerationConfig();
+  auto engine_config = this->engine_->GetCompleteEngineConfig();
+
+  // Load conversation template.
+  Result<picojson::object> model_config_json = serve::Model::LoadModelConfig(engine_config->model);
+  CHECK(model_config_json.IsOk()) << model_config_json.UnwrapErr();
+  const picojson::object& model_config_json_unwrapped = model_config_json.Unwrap();
+  Result<Conversation> conv_template = Conversation::FromJSON(
+      json::Lookup<picojson::object>(model_config_json_unwrapped, "conv_template"));
+  CHECK(!conv_template.IsErr()) << "Invalid conversation template JSON: "
+                                << conv_template.UnwrapErr();
+  this->conv_template_ = conv_template.Unwrap();
+  this->model_config_ = ModelConfig::FromJSON(
+      json::Lookup<picojson::object>(model_config_json_unwrapped, "model_config"));
+  this->tokenizer_ = Tokenizer::FromPath(engine_config->model);
+}
+
+void JSONFFIEngineImpl::Unload() { this->engine_->Unload(); }
+
+void JSONFFIEngineImpl::Reset() { this->engine_->Reset(); }
+
+void JSONFFIEngineImpl::RunBackgroundLoop() { this->engine_->RunBackgroundLoop(); }
+
+void JSONFFIEngineImpl::RunBackgroundStreamBackLoop() {
+  this->engine_->RunBackgroundStreamBackLoop();
+}
+
+String JSONFFIEngineImpl::GetResponseFromStreamOutput(Array<RequestStreamOutput> delta_outputs) {
+  picojson::array json_response_arr;
+  for (const auto& delta_output : delta_outputs) {
+    std::string request_id = delta_output->request_id;
+    auto request_state_it = request_map_.find(request_id);
+    if (request_state_it == request_map_.end()) continue;
+    RequestState& rstate = request_state_it->second;
+
+    // build the final usage messages
+    // invariant, we can always let other messages to come first
+    // then the final usage messages, as final usage is always last
+    if (delta_output->request_final_usage_json_str.defined()) {
+      ChatCompletionStreamResponse response;
+      response.id = request_id;
+      response.model = rstate.model;
+      response.system_fingerprint = "";
+      std::string usage_json_str = delta_output->request_final_usage_json_str.value();
+      picojson::value usage_json;
+      std::string err = picojson::parse(usage_json, usage_json_str);
+      if (!err.empty()) {
+        err_ = err;
+      } else {
+        response.usage = usage_json;
+      }
+      json_response_arr.push_back(picojson::value(response.AsJSON()));
+      request_map_.erase(request_state_it);
+      continue;
+    }
+    ICHECK_NE(delta_output->group_finish_reason.size(), 0);
+    ICHECK_EQ(delta_output->group_delta_token_ids.size(), delta_output->group_finish_reason.size());
+    ICHECK_EQ(delta_output->group_delta_token_ids.size(), rstate.streamer.size());
+
+    ChatCompletionStreamResponse response;
+    response.id = request_id;
+    response.model = rstate.model;
+    response.system_fingerprint = "";
+
+    for (size_t i = 0; i < delta_output->group_finish_reason.size(); ++i) {
+      // choice
+      ChatCompletionStreamResponseChoice choice;
+      Optional<String> finish_reason = delta_output->group_finish_reason[i];
+      if (finish_reason.defined()) {
+        if (finish_reason.value() == "stop") {
+          choice.finish_reason = FinishReason::stop;
+        } else if (finish_reason.value() == "length") {
+          choice.finish_reason = FinishReason::length;
+        } else if (finish_reason.value() == "tool_calls") {
+          choice.finish_reason = FinishReason::tool_calls;
+        } else if (finish_reason.value() == "error") {
+          choice.finish_reason = FinishReason::error;
+        }
+      } else {
+        choice.finish_reason = std::nullopt;
+      }
+      choice.index = static_cast<int>(i);
+      ChatCompletionMessage delta;
+      // Size of delta_output->group_delta_token_ids Array should be 1
+      const IntTuple& delta_token_ids = delta_output->group_delta_token_ids[i];
+      std::vector<int32_t> delta_token_ids_vec(delta_token_ids.begin(), delta_token_ids.end());
+      std::string content = rstate.streamer[i]->Put(delta_token_ids_vec);
+      if (finish_reason.defined()) {
+        content += rstate.streamer[i]->Finish();
+      }
+      if (!content.empty()) {
+        delta.content = content;
+      }
+      delta.role = "assistant";
+      choice.delta = delta;
+      if (!choice.delta.content.IsNull() || choice.finish_reason.has_value()) {
+        response.choices.push_back(choice);
+      }
+    }
+    // if it is not the usage block, choices cannot be empty
+    if (!response.choices.empty()) {
+      json_response_arr.push_back(picojson::value(response.AsJSON()));
+    }
+  }
+  return picojson::value(json_response_arr).serialize();
+}
+
+Module JSONFFIEngineImpl::Create() {
+  auto n = make_object<JSONFFIEngineImpl>();
+  return Module(n);
+}
+
+// TVM_REGISTER_GLOBAL("mlc.json_ffi.CreateJSONFFIEngine").set_body_typed(JSONFFIEngineImpl::Create);
 
 TVM_REGISTER_GLOBAL("mlc.json_ffi.CreateJSONFFIEngine").set_body_typed([]() {
   return Module(make_object<JSONFFIEngineImpl>());
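A plausible reading of this refactor (the class body moves to json_ffi_engine.h below, and the methods become out-of-class definitions) is that other translation units, such as the new llm_benchmark target, can now construct the engine directly instead of going through the packed-function registry. A minimal sketch of that direct path, assuming a target that links mlc_llm_module and has cpp/ on its include path:

#include "json_ffi/json_ffi_engine.h"

// Create() wraps a fresh JSONFFIEngineImpl in a tvm::runtime::Module; it returns
// the same kind of module handle the "mlc.json_ffi.CreateJSONFFIEngine" global yields.
tvm::runtime::Module engine = mlc::llm::json_ffi::JSONFFIEngineImpl::Create();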
33 changes: 33 additions & 0 deletions cpp/json_ffi/json_ffi_engine.h
@@ -69,6 +69,39 @@ class JSONFFIEngine {
   std::unordered_map<String, RequestState> request_map_;
 };
 
+class JSONFFIEngineImpl : public JSONFFIEngine, public ModuleNode {
+ public:
+  static Module Create();
+
+  TVM_MODULE_VTABLE_BEGIN("mlc.json_ffi");
+  TVM_MODULE_VTABLE_ENTRY("init_background_engine", &JSONFFIEngineImpl::InitBackgroundEngine);
+  TVM_MODULE_VTABLE_ENTRY("reload", &JSONFFIEngineImpl::Reload);
+  TVM_MODULE_VTABLE_ENTRY("unload", &JSONFFIEngineImpl::Unload);
+  TVM_MODULE_VTABLE_ENTRY("reset", &JSONFFIEngineImpl::Reset);
+  TVM_MODULE_VTABLE_ENTRY("chat_completion", &JSONFFIEngineImpl::ChatCompletion);
+  TVM_MODULE_VTABLE_ENTRY("abort", &JSONFFIEngineImpl::Abort);
+  TVM_MODULE_VTABLE_ENTRY("get_last_error", &JSONFFIEngineImpl::GetLastError);
+  TVM_MODULE_VTABLE_ENTRY("run_background_loop", &JSONFFIEngineImpl::RunBackgroundLoop);
+  TVM_MODULE_VTABLE_ENTRY("run_background_stream_back_loop",
+                          &JSONFFIEngineImpl::RunBackgroundStreamBackLoop);
+  TVM_MODULE_VTABLE_ENTRY("exit_background_loop", &JSONFFIEngineImpl::ExitBackgroundLoop);
+  TVM_MODULE_VTABLE_END();
+
+  void InitBackgroundEngine(int device_type, int device_id,
+                            Optional<PackedFunc> request_stream_callback);
+  void Reload(String engine_config_json_str);
+  void Unload();
+  void Reset();
+  void RunBackgroundLoop();
+  void RunBackgroundStreamBackLoop();
+
+  // Implement the TVM_MODULE_VTABLE
+  // TVM_DEFINE_OBJECT_REF_METHODS(JSONFFIEngineImpl, ModuleNode, JSONFFIEngineImplNode);
+
+ private:
+  String GetResponseFromStreamOutput(Array<RequestStreamOutput> delta_outputs);
+};
+
 } // namespace json_ffi
 } // namespace llm
 } // namespace mlc
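Each TVM_MODULE_VTABLE_ENTRY in the class above binds a string name to a member function, so callers that only hold a tvm::runtime::Module (the Android JNI layer, or a native driver like the benchmark) reach the engine through Module::GetFunction rather than direct C++ calls. Errors cross that boundary as status values; a sketch of the convention, assuming chat_completion returns a success flag and get_last_error returns the recorded message (the helper itself is hypothetical, not part of the PR):

#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>

#include <iostream>
#include <string>

// Hypothetical helper: issue one request and surface engine-side errors.
bool TryChatCompletion(tvm::runtime::Module engine, const std::string& request_json,
                       const std::string& request_id) {
  bool ok = engine.GetFunction("chat_completion")(request_json, request_id);
  if (!ok) {
    // get_last_error exposes the err_ string recorded by JSONFFIEngine.
    std::string err = engine.GetFunction("get_last_error")();
    std::cerr << "chat_completion failed: " << err << std::endl;
  }
  return ok;
}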