Fix the issue that, when generating the EP context model, nodes with external initializers still point to the old external data file.

Add options to specify the new external file and a size threshold. Initializers below the threshold are kept inside the model.
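
The new keys can be set like any other session-config entry. A minimal, hypothetical C++ sketch follows (file names and the 512 KB threshold are illustrative, not part of this commit; the two new key strings appear verbatim in the diff below, while ep.context_enable and ep.context_file_path are the existing options the referenced constants stand for):

    #include <onnxruntime_cxx_api.h>

    int main() {
      Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ep_ctx_demo");
      Ort::SessionOptions so;

      // Existing options: enable the EP context dump and name the output model.
      so.AddConfigEntry("ep.context_enable", "1");
      so.AddConfigEntry("ep.context_file_path", "model_ctx.onnx");

      // Options added by this commit: initializers of at least 512 KB go to the
      // new external file; smaller ones are embedded in model_ctx.onnx itself.
      so.AddConfigEntry("ep.context_model_external_initializers_file_name", "model_ctx_ext.bin");
      so.AddConfigEntry("ep.context_model_external_initializers_min_size_in_bytes", "524288");

      Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // hypothetical input model
      return 0;
    }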
HectorSVC committed Jan 25, 2025
1 parent 1fc9c48 commit dbd3673
Showing 5 changed files with 125 additions and 3 deletions.
@@ -283,6 +283,14 @@ static const char* const kOrtSessionOptionEpContextNodeNamePrefix = "ep.context_
// Share EP related resources across EPs
static const char* const kOrtSessionOptionShareEpContexts = "ep.share_ep_contexts";

// Use this config when dumping the EP context model with an external initializers file
static const char* const kOrtSessionOptionsEpContextModelExternalInitializersFileName =
"ep.context_model_external_initializers_file_name";

// Use this config to control the minimum size an initializer must have to be externalized during serialization of the EP context model
static const char* const kOrtSessionOptionsEpContextModelExternalInitializersMinSizeInBytes =
"ep.context_model_external_initializers_min_size_in_bytes";

// Gemm fastmath mode provides fp32 gemm acceleration with bfloat16 based matmul.
// Option values:
// - "0": Gemm FastMath mode is not enabled. [DEFAULT]
25 changes: 22 additions & 3 deletions onnxruntime/core/framework/graph_partitioner.cc
@@ -16,6 +16,7 @@
#include "core/graph/function_utils.h"
#include "core/graph/graph_viewer.h"
#include "core/graph/model.h"
#include "core/graph/model_saving_options.h"
#include "core/session/onnxruntime_session_options_config_keys.h"

// uncomment this line to count non-CUDA ops in ONNX domain
@@ -645,6 +646,8 @@ static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_providers,
static Status CreateEpContextModel(const ExecutionProviders& execution_providers,
const Graph& graph,
const std::filesystem::path& ep_context_path,
const std::filesystem::path& ep_context_ext_ini_path,
size_t size_threshold,
const logging::Logger& logger) {
InlinedVector<const Node*> all_ep_context_nodes;
for (const auto& ep : execution_providers) {
@@ -676,6 +679,15 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Both ep_context_path and model_path are empty");
}

std::filesystem::path context_cache_ext_ini_path;
if (!ep_context_ext_ini_path.empty()) {
context_cache_ext_ini_path = ep_context_ext_ini_path;
} else if (!model_path.empty()) {
context_cache_ext_ini_path = model_path.filename().native() + ORT_TSTR("_ext.bin");
} else {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Both ep_context_path and model_path are empty");
}

if (std::filesystem::exists(context_cache_path)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to generate EP context model since the file '",
context_cache_path, "' exists already.");
@@ -727,7 +739,9 @@
}
}

-  ORT_RETURN_IF_ERROR(Model::Save(ep_context_model, context_cache_path));
+  ModelSavingOptions model_saving_options{size_threshold};
+  ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(ep_context_model, context_cache_path,
+                                                          context_cache_ext_ini_path, model_saving_options));

return Status::OK();
}
@@ -993,9 +1007,14 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_, logger));

bool ep_context_enabled = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1";
-  std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
   if (ep_context_enabled) {
-    ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, logger));
+    std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
+    std::string external_ini_file_name = config_options.GetConfigOrDefault(kOrtSessionOptionsEpContextModelExternalInitializersFileName, "");
+    const size_t model_external_initializers_min_size_in_bytes =
+        ParseStringWithClassicLocale<size_t>(config_options.GetConfigOrDefault(
+            kOrtSessionOptionsEpContextModelExternalInitializersMinSizeInBytes, "1024000"));
+    ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, external_ini_file_name,
+                                             model_external_initializers_min_size_in_bytes, logger));
}
#else
ORT_UNUSED_PARAMETER(config_options);
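Note the fallback in CreateEpContextModel above: when ep.context_model_external_initializers_file_name is not set, the external file name is derived from the source model's file name only, with the directory dropped, so the file is created relative to the current working directory. A small sketch of that same derivation (the path is hypothetical):

    #include <filesystem>
    #include <iostream>

    int main() {
      std::filesystem::path model_path = "/path/to/model.onnx";
      // Mirrors the fallback: filename() drops "/path/to/".
      // POSIX sketch; native() is std::wstring on Windows.
      auto ext_ini = std::filesystem::path(model_path.filename().native() + "_ext.bin");
      std::cout << ext_ini << '\n';  // prints "model.onnx_ext.bin"
      return 0;
    }

The threshold option defaults to 1024000 bytes (roughly 1 MB) when unset, per the ParseStringWithClassicLocale fallback above.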
68 changes: 68 additions & 0 deletions onnxruntime/core/framework/tensorprotoutils.cc
@@ -270,6 +270,74 @@ void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, std::string&& param) {
tensor_proto.set_raw_data(std::move(param));
}

void PackDataIntoTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto,
const std::vector<uint8_t>& unpacked_data,
int64_t data_size) {
switch (tensor_proto.data_type()) {
case TensorProto_DataType_FLOAT: {
const float* data_f = reinterpret_cast<const float*>(unpacked_data.data());
tensor_proto.mutable_float_data()->Add(data_f, data_f + data_size);
break;
}

case TensorProto_DataType_INT32: {
const int32_t* data_i32 = reinterpret_cast<const int32_t*>(unpacked_data.data());
tensor_proto.mutable_int32_data()->Add(data_i32, data_i32 + data_size);
break;
}

case TensorProto_DataType_UINT32: {
const uint32_t* data_ui32 = reinterpret_cast<const uint32_t*>(unpacked_data.data());
tensor_proto.mutable_int32_data()->Add(data_ui32, data_ui32 + data_size);
break;
}

case TensorProto_DataType_UINT8:
case TensorProto_DataType_INT8: {
data_size = data_size * sizeof(uint8_t) / sizeof(int32_t);
const int32_t* data_i8 = reinterpret_cast<const int32_t*>(unpacked_data.data());
tensor_proto.mutable_int32_data()->Add(data_i8, data_i8 + data_size);
break;
}

case TensorProto_DataType_UINT16:
case TensorProto_DataType_INT16:
case TensorProto_DataType_FLOAT16:
case TensorProto_DataType_BFLOAT16: {
data_size = data_size * sizeof(uint16_t) / sizeof(int32_t);
const int32_t* data_16 = reinterpret_cast<const int32_t*>(unpacked_data.data());
tensor_proto.mutable_int32_data()->Add(data_16, data_16 + data_size);
break;
}

case TensorProto_DataType_UINT64: {
const uint64_t* data_ui64 = reinterpret_cast<const uint64_t*>(unpacked_data.data());
tensor_proto.mutable_uint64_data()->Add(data_ui64, data_ui64 + data_size);
break;
}

case TensorProto_DataType_INT64: {
const int64_t* data_i64 = reinterpret_cast<const int64_t*>(unpacked_data.data());
tensor_proto.mutable_int64_data()->Add(data_i64, data_i64 + data_size);
break;
}

case TensorProto_DataType_DOUBLE: {
const double* data_d = reinterpret_cast<const double*>(unpacked_data.data());
tensor_proto.mutable_double_data()->Add(data_d, data_d + data_size);
break;
}

case TensorProto_DataType_COMPLEX64: {
const float* data_c = reinterpret_cast<const float*>(unpacked_data.data());
tensor_proto.mutable_float_data()->Add(data_c, data_c + data_size);
break;
}
}

return;
}

void ConvertRawDataInTensorProto(TensorProto* tensor) {
size_t element_size = 1;
char* bytes = NULL;
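A minimal usage sketch for the new PackDataIntoTensorProto, assuming the ONNX protobuf headers and the onnxruntime source tree (values are illustrative). Note that data_size is the element count taken from the tensor shape, not the byte count, and that the sub-32-bit branches reinterpret the byte buffer as int32 words, so the byte count is effectively assumed to be a multiple of four:

    #include <cstdint>
    #include <cstring>
    #include <vector>
    #include "core/framework/tensorprotoutils.h"

    void Demo() {
      ONNX_NAMESPACE::TensorProto proto;
      proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
      proto.add_dims(4);

      // Raw little-endian bytes for four floats {1, 2, 3, 4}.
      const float values[4] = {1.f, 2.f, 3.f, 4.f};
      std::vector<uint8_t> bytes(sizeof(values));
      std::memcpy(bytes.data(), values, sizeof(values));

      // data_size = 4 elements (from dims), not 16 bytes.
      onnxruntime::utils::PackDataIntoTensorProto(proto, bytes, /*data_size=*/4);
      // proto.float_data_size() is now 4.
    }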
11 changes: 11 additions & 0 deletions onnxruntime/core/framework/tensorprotoutils.h
@@ -74,6 +74,17 @@ void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, T1* raw_
* @returns None
*/
void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, std::string&& param);

/**
* Pack data into a TensorProto.
* @param tensor_proto the destination initializer tensor
* @param unpacked_data the unpacked data as a byte (uint8_t) vector
* @param data_size the number of elements in the original data type, derived from the shape/dims
* @returns None
*/
void PackDataIntoTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto,
const std::vector<uint8_t>& unpacked_data,
int64_t data_size);
} // namespace utils
} // namespace onnxruntime

16 changes: 16 additions & 0 deletions onnxruntime/core/graph/graph.cc
@@ -4175,6 +4175,15 @@ Status Graph::AddExternalInitializersToGraphProtoImpl(
size_t tensor_bytes_size = raw_data.size();
if (tensor_bytes_size < model_saving_options.initializer_size_threshold) {
*output_proto = initializer;
// Data whose size is above the threshold is written into the new external initializer file.
// Data below the threshold should be kept inside the new model file
// instead of being left in the old external initializer file of the old ONNX model.
if (initializer.data_location() == TensorProto_DataLocation_EXTERNAL) {
TensorShape shape(initializer.dims());
auto data_size = shape.Size();
utils::PackDataIntoTensorProto(*output_proto, raw_data, data_size);
output_proto->clear_data_location();
}
if (process_prepacks) {
// These pre-packs will reside in memory
processed_weights.insert(initializer.name());
@@ -4263,6 +4272,7 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(

// Create the external file.
std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary);
auto const external_empty_pos = external_stream.tellp();
ORT_ENFORCE(external_stream.is_open(), "Failed to open for writing:", modified_external_file_path);
int64_t external_offset = 0;

@@ -4275,6 +4285,12 @@
ORT_THROW("Failed to flush file with external initializers: ", modified_external_file_path);
}

// Delete the external data file if it ended up empty
if (external_empty_pos == external_stream.tellp()) {
external_stream.close();
std::remove(modified_external_file_path.string().c_str());
}

return result;
}
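
With the re-embedding change above, an initializer kept inside the model has its data_location cleared, so a saved EP context model can be checked for lingering external references. A hypothetical helper over the plain ONNX protobuf, not part of this commit:

    // Returns true when no initializer in the graph still points at an
    // external data file.
    bool AllInitializersInline(const ONNX_NAMESPACE::GraphProto& graph) {
      for (const auto& init : graph.initializer()) {
        if (init.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) {
          return false;
        }
      }
      return true;
    }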
