diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/sync_infer_request.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/sync_infer_request.hpp
index cdead8a816626d..916427c280310c 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/sync_infer_request.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/sync_infer_request.hpp
@@ -118,6 +118,7 @@ class SyncInferRequest : public ov::ISyncInferRequest {
     void init_mappings();
     bool is_batched_input(const ov::Output<const ov::Node>& port) const;
+    uint64_t total_output_bytes = 0;
 };
 
 }  // namespace intel_gpu
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
index 465ed898ecb7ec..a020c5d1cd5ef6 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
@@ -142,6 +142,7 @@ class debug_configuration {
     int disable_runtime_skip_reorder;       // Disable runtime skip reorder
     int disable_primitive_fusing;           // Disable primitive fusing
     int disable_fake_alignment;             // Disable fake alignment
+    int use_usm_host;                       // Set explicit usm_host usage for network input and output
     std::vector<std::string> dynamic_quantize_layers_without_onednn; // Specify Fully-connected layers which enable Dynamic quantization
     int use_kv_cache_compression;           // Enable KV-cache compression
     int dynamic_quantize_group_size;        // Enable Dynamic quantization for fully connected primitive by specified group size
diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
index 985336b801b9d3..bf22f7a6e4c5d0 100644
--- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
+++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
@@ -32,12 +32,16 @@
-inline bool can_use_usm_host(const cldnn::engine& engine) {
+inline bool can_use_usm_host(const cldnn::engine& engine, const uint64_t total_output_bytes) {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->use_usm_host == 1) { return true; }
+    GPU_DEBUG_IF(debug_config->use_usm_host == 2) { return false; }
+
     auto can_use_usm = engine.use_unified_shared_memory();
     const auto& device_info = engine.get_device_info();
     if ((device_info.gfx_ver.major == 12 && device_info.gfx_ver.minor == 60) ||
-        (device_info.gfx_ver.major >= 20 && device_info.dev_type == cldnn::device_type::discrete_gpu)) {
+        (device_info.dev_type == cldnn::device_type::discrete_gpu && total_output_bytes > 4*1048576)) {
         // WA: Disable USM host memory for infer request`s tensors for PVC and subsequent dGPUs, as kernel access
         // to system memory is slower than using an explicit memcpy (Host <-> Device) call with the copy engine
         // Driver tickets with additional details: 6155, 10054
@@ -517,7 +521,7 @@ std::shared_ptr<ov::ITensor> SyncInferRequest::create_device_tensor(const ov::Pa
     }
 
     // Create OpenCL buffer for PVC if lockable memory is needed due to performance issue with usm host
-    if (!can_use_usm_host(m_graph->get_engine()) && need_lockable_memory)
+    if (!can_use_usm_host(m_graph->get_engine(), total_output_bytes) && need_lockable_memory)
         tensor_type = TensorType::BT_BUF_INTERNAL;
 
     return std::make_shared<RemoteTensorImpl>(m_context,
@@ -546,7 +550,9 @@ TensorWrapper SyncInferRequest::create_or_share_device_tensor(const TensorWrappe
     auto usm_host_raw_ptr = engine.get_device_info().dev_type == cldnn::device_type::integrated_gpu &&
                             user_tensor_mem_type == cldnn::allocation_type::usm_host;
 
-    bool can_share = !is_convert_required(user_tensor->get_element_type(), element_type) && can_use_usm_host(engine) && !generic_remote_tensor;
+    bool can_share = !is_convert_required(user_tensor->get_element_type(), element_type)
+                     && can_use_usm_host(engine, total_output_bytes)
+                     && !generic_remote_tensor;
 
     if (usm_host_tensor && can_share && m_context == usm_host_tensor->get_impl()->get_context()) {
         return { usm_host_tensor->get_impl(), user_tensor_wrapper.owner };
@@ -635,6 +641,7 @@ void SyncInferRequest::allocate_inputs() {
 void SyncInferRequest::allocate_outputs() {
     OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::allocate_outputs");
+    total_output_bytes = 0;
 
     // allocate outputs
     for (const auto& it : m_output_ports_map) {
         size_t output_idx = it.first;
@@ -642,6 +649,7 @@ void SyncInferRequest::allocate_outputs() {
 
         GPU_DEBUG_LOG << "[init output blob with index: " << output_idx << "]" << std::endl;
         allocate_output(port, output_idx);
+        total_output_bytes += ov::ISyncInferRequest::get_tensor(port)->get_byte_size();
     }
 }
@@ -790,7 +798,7 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
         } else {
             m_plugin_inputs[input_idx] = user_tensor_wrapper;
         }
-    } else if (is_usm_host_tensor && !convert_needed && can_use_usm_host(engine)) {
+    } else if (is_usm_host_tensor && !convert_needed && can_use_usm_host(engine, total_output_bytes)) {
         if (element_type != cldnn::element_type_to_data_type(element_type)) {
             m_plugin_inputs[input_idx] = { std::make_shared<RemoteTensorImpl>(m_context,
                                                                               user_tensor->get_shape(),
diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
index 5c3b3ee0c970f9..4a68355e1bc8ba 100644
--- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
+++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@@ -183,6 +183,8 @@ static void print_help_messages() {
     message_list.emplace_back("OV_GPU_DisableRuntimeSkipReorder", "Disable runtime skip reorder.");
     message_list.emplace_back("OV_GPU_DisablePrimitiveFusing", "Disable primitive fusing");
     message_list.emplace_back("OV_GPU_DisableFakeAlignment", "Disable fake alignment");
+    message_list.emplace_back("OV_GPU_UseUsmHost", "Set explicit policy for usm host usage for network input/output. "
+                              "0: default, 1: use usm_host, 2: do not use usm_host");
     message_list.emplace_back("OV_GPU_KVCacheCompression", "Enable/Disable KV-cache compression");
     message_list.emplace_back("OV_GPU_DynamicQuantizeLayersWithoutOnednn", "Enable Dynamic quantization for specified Fully connected layers only, "
                               "separated by space. Support case-insensitive and regular expression. For example .*fully_connected.*");
@@ -254,6 +256,7 @@ debug_configuration::debug_configuration()
     , disable_runtime_skip_reorder(0)
     , disable_primitive_fusing(0)
     , disable_fake_alignment(0)
+    , use_usm_host(0)
     , use_kv_cache_compression(-1)
     , dynamic_quantize_group_size(DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET)
     , disable_horizontal_fc_fusion(0) {
@@ -307,6 +310,7 @@ debug_configuration::debug_configuration()
     get_gpu_debug_env_var("DisableRuntimeSkipReorder", disable_runtime_skip_reorder);
     get_gpu_debug_env_var("DisablePrimitiveFusing", disable_primitive_fusing);
     get_gpu_debug_env_var("DisableFakeAlignment", disable_fake_alignment);
+    get_gpu_debug_env_var("UseUsmHost", use_usm_host);
     get_gpu_debug_env_var("KVCacheCompression", use_kv_cache_compression);
     get_gpu_debug_env_var("DynamicQuantizeGroupSize", dynamic_quantize_group_size);
     get_gpu_debug_env_var("DisableHorizontalFCFusion", disable_horizontal_fc_fusion);
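
Reviewer note: to make the new selection policy easier to eyeball, below is a minimal standalone sketch of the logic this diff installs in `can_use_usm_host`. It is not the plugin code: `device_info`, `use_usm_host_override`, and the free function here are hypothetical stand-ins for `cldnn::device_info`, `debug_configuration::use_usm_host`, and the plugin helper, reduced to just the fields the policy reads.

```cpp
#include <cstdint>
#include <iostream>

namespace sketch {

enum class device_type { integrated_gpu, discrete_gpu };

// Hypothetical stand-in for the fields of cldnn::device_info the policy reads.
struct device_info {
    int gfx_major = 0;
    int gfx_minor = 0;
    device_type dev_type = device_type::integrated_gpu;
};

// Stand-in for debug_configuration::use_usm_host (OV_GPU_UseUsmHost):
// 0: default heuristic, 1: force usm_host, 2: forbid usm_host.
int use_usm_host_override = 0;

bool can_use_usm_host(const device_info& info, bool engine_supports_usm, uint64_t total_output_bytes) {
    // The explicit debug override short-circuits the heuristic, as in the diff.
    if (use_usm_host_override == 1) return true;
    if (use_usm_host_override == 2) return false;

    // WA from the diff: usm_host stays off for PVC (gfx 12.60) and, after this
    // change, for any discrete GPU whose combined output size exceeds 4 MiB
    // (4 * 1048576 bytes), where an explicit copy-engine memcpy beats kernel
    // access to system memory.
    const bool is_pvc = info.gfx_major == 12 && info.gfx_minor == 60;
    const bool big_outputs_on_dgpu = info.dev_type == device_type::discrete_gpu &&
                                     total_output_bytes > 4u * 1048576u;
    if (is_pvc || big_outputs_on_dgpu)
        return false;

    return engine_supports_usm;
}

}  // namespace sketch

int main() {
    sketch::device_info dgpu{20, 1, sketch::device_type::discrete_gpu};
    // 8 MiB of outputs on a non-PVC dGPU: usm_host is rejected.
    std::cout << sketch::can_use_usm_host(dgpu, true, 8 * 1048576) << '\n';  // 0
    // 1 MiB of outputs: usm_host is allowed again, which is the behavioral
    // change versus the old unconditional gfx_ver.major >= 20 dGPU check.
    std::cout << sketch::can_use_usm_host(dgpu, true, 1 * 1048576) << '\n';  // 1
    return 0;
}
```

Two behavioral points worth confirming during review, both visible in the sketch: the 4 MiB threshold (`4*1048576` bytes) replaces the old blanket `gfx_ver.major >= 20` dGPU check, so small-output networks on newer dGPUs regain usm_host tensors; and `total_output_bytes` is reset and re-accumulated on every `allocate_outputs()` call, while `OV_GPU_UseUsmHost=1`/`=2` bypasses the heuristic entirely.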