Fix the issue that, when generating the EP context model, nodes with external initializers still point to the old external data file.

Add options to specify the new external file and a size threshold. Initializers below the threshold are kept inside the model.
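
The new keys can be set like any other session-config entry. A minimal, hypothetical C++ sketch follows (file names and the 512 KB threshold are illustrative, not part of this commit; the two new key strings appear verbatim in the diff below, while ep.context_enable and ep.context_file_path are the existing options the referenced constants stand for):

    #include <onnxruntime_cxx_api.h>

    int main() {
      Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ep_ctx_demo");
      Ort::SessionOptions so;

      // Existing options: enable the EP context dump and name the output model.
      so.AddConfigEntry("ep.context_enable", "1");
      so.AddConfigEntry("ep.context_file_path", "model_ctx.onnx");

      // Options added by this commit: initializers of at least 512 KB go to the
      // new external file; smaller ones are embedded in model_ctx.onnx itself.
      so.AddConfigEntry("ep.context_model_external_initializers_file_name", "model_ctx_ext.bin");
      so.AddConfigEntry("ep.context_model_external_initializers_min_size_in_bytes", "524288");

      Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // hypothetical input model
      return 0;
    }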
HectorSVC committed Jan 25, 2025
1 parent 1fc9c48 commit dbd3673
Showing 5 changed files with 125 additions and 3 deletions.
@@ -283,6 +283,14 @@ static const char* const kOrtSessionOptionEpContextNodeNamePrefix = "ep.context_
// Share EP related resources across EPs
static const char* const kOrtSessionOptionShareEpContexts = "ep.share_ep_contexts";

// Use this config when dumping the EP context model with an external initializers file
static const char* const kOrtSessionOptionsEpContextModelExternalInitializersFileName =
"ep.context_model_external_initializers_file_name";

// Use this config to control the minimum size an initializer must have to be externalized during serialization of the EP context model
static const char* const kOrtSessionOptionsEpContextModelExternalInitializersMinSizeInBytes =
"ep.context_model_external_initializers_min_size_in_bytes";

// Gemm fastmath mode provides fp32 gemm acceleration with bfloat16 based matmul.
// Option values:
// - "0": Gemm FastMath mode is not enabled. [DEFAULT]
25 changes: 22 additions & 3 deletions onnxruntime/core/framework/graph_partitioner.cc
@@ -16,6 +16,7 @@
#include "core/graph/function_utils.h"
#include "core/graph/graph_viewer.h"
#include "core/graph/model.h"
#include "core/graph/model_saving_options.h"
#include "core/session/onnxruntime_session_options_config_keys.h"

// uncomment this line to count non-CUDA ops in ONNX domain
@@ -645,6 +646,8 @@ static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_providers,
static Status CreateEpContextModel(const ExecutionProviders& execution_providers,
const Graph& graph,
const std::filesystem::path& ep_context_path,
const std::filesystem::path& ep_context_ext_ini_path,
size_t size_threshold,
const logging::Logger& logger) {
InlinedVector<const Node*> all_ep_context_nodes;
for (const auto& ep : execution_providers) {
@@ -676,6 +679,15 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Both ep_context_path and model_path are empty");
}

std::filesystem::path context_cache_ext_ini_path;
if (!ep_context_ext_ini_path.empty()) {
context_cache_ext_ini_path = ep_context_ext_ini_path;
} else if (!model_path.empty()) {
context_cache_ext_ini_path = model_path.filename().native() + ORT_TSTR("_ext.bin");
} else {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Both ep_context_path and model_path are empty");
}

if (std::filesystem::exists(context_cache_path)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to generate EP context model since the file '",
context_cache_path, "' exists already.");
@@ -727,7 +739,9 @@
}
}

-  ORT_RETURN_IF_ERROR(Model::Save(ep_context_model, context_cache_path));
+  ModelSavingOptions model_saving_options{size_threshold};
+  ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(ep_context_model, context_cache_path,
+                                                          context_cache_ext_ini_path, model_saving_options));

return Status::OK();
}
@@ -993,9 +1007,14 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_, logger));

bool ep_context_enabled = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1";
-  std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
   if (ep_context_enabled) {
-    ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, logger));
+    std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
+    std::string external_ini_file_name = config_options.GetConfigOrDefault(kOrtSessionOptionsEpContextModelExternalInitializersFileName, "");
+    const size_t model_external_initializers_min_size_in_bytes =
+        ParseStringWithClassicLocale<size_t>(config_options.GetConfigOrDefault(
+            kOrtSessionOptionsEpContextModelExternalInitializersMinSizeInBytes, "1024000"));
+    ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, external_ini_file_name,
+                                             model_external_initializers_min_size_in_bytes, logger));
}
#else
ORT_UNUSED_PARAMETER(config_options);
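Note the fallback in CreateEpContextModel above: when ep.context_model_external_initializers_file_name is not set, the external file name is derived from the source model's file name only, with the directory dropped, so the file is created relative to the current working directory. A small sketch of that same derivation (the path is hypothetical):

    #include <filesystem>
    #include <iostream>

    int main() {
      std::filesystem::path model_path = "/path/to/model.onnx";
      // Mirrors the fallback: filename() drops "/path/to/".
      // POSIX sketch; native() is std::wstring on Windows.
      auto ext_ini = std::filesystem::path(model_path.filename().native() + "_ext.bin");
      std::cout << ext_ini << '\n';  // prints "model.onnx_ext.bin"
      return 0;
    }

The threshold option defaults to 1024000 bytes (roughly 1 MB) when unset, per the ParseStringWithClassicLocale fallback above.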
68 changes: 68 additions & 0 deletions onnxruntime/core/framework/tensorprotoutils.cc
@@ -270,6 +270,74 @@ void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, std::string&& param) {
tensor_proto.set_raw_data(std::move(param));
}

void PackDataIntoTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto,
const std::vector<uint8_t>& unpacked_data,
int64_t data_size) {
switch (tensor_proto.data_type()) {
case TensorProto_DataType_FLOAT: {
const float* data_f = reinterpret_cast<const float*>(unpacked_data.data());
tensor_proto.mutable_float_data()->Add(data_f, data_f + data_size);
break;
}

case TensorProto_DataType_INT32: {
const int32_t* data_i32 = reinterpret_cast<const int32_t*>(unpacked_data.data());
tensor_proto.mutable_int32_data()->Add(data_i32, data_i32 + data_size);
break;
}

case TensorProto_DataType_UINT32: {
const uint32_t* data_ui32 = reinterpret_cast<const uint32_t*>(unpacked_data.data());
tensor_proto.mutable_int32_data()->Add(data_ui32, data_ui32 + data_size);
break;
}

case TensorProto_DataType_UINT8:
case TensorProto_DataType_INT8: {
data_size = data_size * sizeof(uint8_t) / sizeof(int32_t);
const int32_t* data_i8 = reinterpret_cast<const int32_t*>(unpacked_data.data());
tensor_proto.mutable_int32_data()->Add(data_i8, data_i8 + data_size);
break;
}

case TensorProto_DataType_UINT16:
case TensorProto_DataType_INT16:
case TensorProto_DataType_FLOAT16:
case TensorProto_DataType_BFLOAT16: {
data_size = data_size * sizeof(uint16_t) / sizeof(int32_t);
const int32_t* data_16 = reinterpret_cast<const int32_t*>(unpacked_data.data());
tensor_proto.mutable_int32_data()->Add(data_16, data_16 + data_size);
break;
}

case TensorProto_DataType_UINT64: {
const uint64_t* data_ui64 = reinterpret_cast<const uint64_t*>(unpacked_data.data());
tensor_proto.mutable_uint64_data()->Add(data_ui64, data_ui64 + data_size);
break;
}

case TensorProto_DataType_INT64: {
const int64_t* data_i64 = reinterpret_cast<const int64_t*>(unpacked_data.data());
tensor_proto.mutable_int64_data()->Add(data_i64, data_i64 + data_size);
break;
}

case TensorProto_DataType_DOUBLE: {
const double* data_d = reinterpret_cast<const double*>(unpacked_data.data());
tensor_proto.mutable_double_data()->Add(data_d, data_d + data_size);
break;
}

case TensorProto_DataType_COMPLEX64: {
const float* data_c = reinterpret_cast<const float*>(unpacked_data.data());
tensor_proto.mutable_float_data()->Add(data_c, data_c + data_size);
break;
}
}

return;
}

void ConvertRawDataInTensorProto(TensorProto* tensor) {
size_t element_size = 1;
char* bytes = NULL;
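A minimal usage sketch for the new PackDataIntoTensorProto, assuming the ONNX protobuf headers and the onnxruntime source tree (values are illustrative). Note that data_size is the element count taken from the tensor shape, not the byte count, and that the sub-32-bit branches reinterpret the byte buffer as int32 words, so the byte count is effectively assumed to be a multiple of four:

    #include <cstdint>
    #include <cstring>
    #include <vector>
    #include "core/framework/tensorprotoutils.h"

    void Demo() {
      ONNX_NAMESPACE::TensorProto proto;
      proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
      proto.add_dims(4);

      // Raw little-endian bytes for four floats {1, 2, 3, 4}.
      const float values[4] = {1.f, 2.f, 3.f, 4.f};
      std::vector<uint8_t> bytes(sizeof(values));
      std::memcpy(bytes.data(), values, sizeof(values));

      // data_size = 4 elements (from dims), not 16 bytes.
      onnxruntime::utils::PackDataIntoTensorProto(proto, bytes, /*data_size=*/4);
      // proto.float_data_size() is now 4.
    }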
11 changes: 11 additions & 0 deletions onnxruntime/core/framework/tensorprotoutils.h
@@ -74,6 +74,17 @@ void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, T1* raw_
* @returns None
*/
void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, std::string&& param);

/**
* Pack data into a TensorProto.
* @param tensor_proto the destination initializer tensor
* @param unpacked_data the unpacked data as a byte (uint8_t) vector
* @param data_size the number of elements in the original data type, derived from the shape/dims
* @returns None
*/
void PackDataIntoTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto,
const std::vector<uint8_t>& unpacked_data,
int64_t data_size);
} // namespace utils
} // namespace onnxruntime

16 changes: 16 additions & 0 deletions onnxruntime/core/graph/graph.cc
@@ -4175,6 +4175,15 @@ Status Graph::AddExternalInitializersToGraphProtoImpl(
size_t tensor_bytes_size = raw_data.size();
if (tensor_bytes_size < model_saving_options.initializer_size_threshold) {
*output_proto = initializer;
// Data whose size is above the threshold is written into the new external initializer file.
// Data below the threshold should be kept inside the new model file
// instead of being left in the old external initializer file of the old ONNX model.
if (initializer.data_location() == TensorProto_DataLocation_EXTERNAL) {
TensorShape shape(initializer.dims());
auto data_size = shape.Size();
utils::PackDataIntoTensorProto(*output_proto, raw_data, data_size);
output_proto->clear_data_location();
}
if (process_prepacks) {
// These pre-packs will reside in memory
processed_weights.insert(initializer.name());
@@ -4263,6 +4272,7 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(

// Create the external file.
std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary);
auto const external_empty_pos = external_stream.tellp();
ORT_ENFORCE(external_stream.is_open(), "Failed to open for writing:", modified_external_file_path);
int64_t external_offset = 0;

@@ -4275,6 +4285,12 @@
ORT_THROW("Failed to flush file with external initializers: ", modified_external_file_path);
}

// Delete the external data file if it ended up empty
if (external_empty_pos == external_stream.tellp()) {
external_stream.close();
std::remove(modified_external_file_path.string().c_str());
}

return result;
}
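
With the re-embedding change above, an initializer kept inside the model has its data_location cleared, so a saved EP context model can be checked for lingering external references. A hypothetical helper over the plain ONNX protobuf, not part of this commit:

    // Returns true when no initializer in the graph still points at an
    // external data file.
    bool AllInitializersInline(const ONNX_NAMESPACE::GraphProto& graph) {
      for (const auto& init : graph.initializer()) {
        if (init.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) {
          return false;
        }
      }
      return true;
    }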
