WIP: [GPU] No usm host for dgpus when output is small #27558

Open · wants to merge 4 commits into base: master
@@ -118,6 +118,7 @@ class SyncInferRequest : public ov::ISyncInferRequest {

void init_mappings();
bool is_batched_input(const ov::Output<const ov::Node>& port) const;
+uint64_t total_output_bytes = 0;
};

} // namespace intel_gpu
@@ -142,6 +142,7 @@ class debug_configuration {
int disable_runtime_skip_reorder; // Disable runtime skip reorder
int disable_primitive_fusing; // Disable primitive fusing
int disable_fake_alignment; // Disable fake alignment
+int use_usm_host; // Set explicit usm_host usage for network input and output
std::vector<std::string> dynamic_quantize_layers_without_onednn; // Specify Fully-connected layers which enable Dynamic quantization
int use_kv_cache_compression; // Enable KV-cache compression
int dynamic_quantize_group_size; // Enable Dynamic quantization for fully connected primitive by specified group size
18 changes: 13 additions & 5 deletions src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
@@ -32,12 +32,16 @@

namespace {

-inline bool can_use_usm_host(const cldnn::engine& engine) {
+inline bool can_use_usm_host(const cldnn::engine& engine, const uint64_t total_output_bytes) {
+GPU_DEBUG_GET_INSTANCE(debug_config);
+GPU_DEBUG_IF(debug_config->use_usm_host == 1) { return true; }
+GPU_DEBUG_IF(debug_config->use_usm_host == 2) { return false; }

auto can_use_usm = engine.use_unified_shared_memory();

const auto& device_info = engine.get_device_info();
if ((device_info.gfx_ver.major == 12 && device_info.gfx_ver.minor == 60) ||
-(device_info.gfx_ver.major >= 20 && device_info.dev_type == cldnn::device_type::discrete_gpu)) {
+(device_info.dev_type == cldnn::device_type::discrete_gpu && total_output_bytes > 4*1048576)) {
// WA: Disable USM host memory for infer request's tensors for PVC and subsequent dGPUs, as kernel access
// to system memory is slower than using an explicit memcpy (Host <-> Device) call with the copy engine
// Driver tickets with additional details: 6155, 10054
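For a sense of scale, here is a minimal sketch of how the new 4*1048576-byte (4 MiB) cut-off behaves on a non-PVC discrete GPU. Only the threshold itself comes from the change above; the output shapes are hypothetical.

#include <cstdint>
#include <iostream>

int main() {
    const uint64_t threshold = 4ull * 1048576;  // 4 MiB cut-off used by can_use_usm_host()

    // Hypothetical f32 outputs: a classification head vs. a segmentation map.
    const uint64_t classification_bytes = 1ull * 1000 * 4;            // shape [1, 1000]         ->      4,000 bytes
    const uint64_t segmentation_bytes   = 1ull * 21 * 520 * 520 * 4;  // shape [1, 21, 520, 520] -> 22,713,600 bytes

    std::cout << std::boolalpha
              << "classification outputs stay within the cut-off: " << (classification_bytes <= threshold) << "\n"
              << "segmentation outputs stay within the cut-off:   " << (segmentation_bytes <= threshold) << "\n";
    return 0;
}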
@@ -517,7 +521,7 @@ std::shared_ptr<ov::ITensor> SyncInferRequest::create_device_tensor(const ov::Pa
}

// Create OpenCL buffer for PVC if lockable memory is needed due to performance issue with usm host
-if (!can_use_usm_host(m_graph->get_engine()) && need_lockable_memory)
+if (!can_use_usm_host(m_graph->get_engine(), total_output_bytes) && need_lockable_memory)
tensor_type = TensorType::BT_BUF_INTERNAL;

return std::make_shared<RemoteTensorImpl>(m_context,
@@ -546,7 +550,9 @@ TensorWrapper SyncInferRequest::create_or_share_device_tensor(const TensorWrappe
auto usm_host_raw_ptr = engine.get_device_info().dev_type == cldnn::device_type::integrated_gpu &&
user_tensor_mem_type == cldnn::allocation_type::usm_host;

-bool can_share = !is_convert_required(user_tensor->get_element_type(), element_type) && can_use_usm_host(engine) && !generic_remote_tensor;
+bool can_share = !is_convert_required(user_tensor->get_element_type(), element_type)
+&& can_use_usm_host(engine, total_output_bytes)
+&& !generic_remote_tensor;

if (usm_host_tensor && can_share && m_context == usm_host_tensor->get_impl()->get_context()) {
return { usm_host_tensor->get_impl(), user_tensor_wrapper.owner };
@@ -635,13 +641,15 @@ void SyncInferRequest::allocate_inputs() {
void SyncInferRequest::allocate_outputs() {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::allocate_outputs");

+total_output_bytes = 0;
// allocate outputs
for (const auto& it : m_output_ports_map) {
size_t output_idx = it.first;
const auto& port = it.second;
GPU_DEBUG_LOG << "[init output blob with index: " << output_idx << "]" << std::endl;

allocate_output(port, output_idx);
+total_output_bytes += ov::ISyncInferRequest::get_tensor(port)->get_byte_size();
}
}

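To estimate the value that allocate_outputs() accumulates for a given network, a rough sketch with the public OpenVINO 2.0 API is shown below. The model path is a placeholder and static output shapes are assumed, so the output tensors are already allocated when create_infer_request() returns.

#include <openvino/openvino.hpp>

#include <cstdint>
#include <iostream>

int main() {
    ov::Core core;
    auto compiled = core.compile_model("model.xml", "GPU");  // "model.xml" is a placeholder path
    auto request = compiled.create_infer_request();

    // Sum get_byte_size() over all outputs, roughly mirroring the accumulation above.
    uint64_t total_output_bytes = 0;
    for (const auto& output : compiled.outputs())
        total_output_bytes += request.get_tensor(output).get_byte_size();

    std::cout << "total output bytes: " << total_output_bytes
              << (total_output_bytes > 4ull * 1048576 ? " (> 4 MiB: usm_host is skipped on dGPUs)"
                                                      : " (<= 4 MiB: usm_host may still be used)")
              << std::endl;
    return 0;
}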
@@ -790,7 +798,7 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
} else {
m_plugin_inputs[input_idx] = user_tensor_wrapper;
}
-} else if (is_usm_host_tensor && !convert_needed && can_use_usm_host(engine)) {
+} else if (is_usm_host_tensor && !convert_needed && can_use_usm_host(engine, total_output_bytes)) {
if (element_type != cldnn::element_type_to_data_type(element_type)) {
m_plugin_inputs[input_idx] = { std::make_shared<RemoteTensorImpl>(m_context,
user_tensor->get_shape(),
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@@ -183,6 +183,8 @@ static void print_help_messages() {
message_list.emplace_back("OV_GPU_DisableRuntimeSkipReorder", "Disable runtime skip reorder.");
message_list.emplace_back("OV_GPU_DisablePrimitiveFusing", "Disable primitive fusing");
message_list.emplace_back("OV_GPU_DisableFakeAlignment", "Disable fake alignment");
message_list.emplace_back("OV_GPU_UseUsmHost", "Set explicit policy for usm host usage for network input/output. "
"0: default, 1: use usm_host, 2: do not use usm_host");
message_list.emplace_back("OV_GPU_KVCacheCompression", "Enable/Disable KV-cache compression");
message_list.emplace_back("OV_GPU_DynamicQuantizeLayersWithoutOnednn", "Enable Dynamic quantization for specified Fully connected layers only, "
"separated by space. Support case-insensitive and regular expression. For example .*fully_connected.*");
@@ -254,6 +256,7 @@ debug_configuration::debug_configuration()
, disable_runtime_skip_reorder(0)
, disable_primitive_fusing(0)
, disable_fake_alignment(0)
+, use_usm_host(0)
, use_kv_cache_compression(-1)
, dynamic_quantize_group_size(DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET)
, disable_horizontal_fc_fusion(0) {
@@ -307,6 +310,7 @@ debug_configuration::debug_configuration()
get_gpu_debug_env_var("DisableRuntimeSkipReorder", disable_runtime_skip_reorder);
get_gpu_debug_env_var("DisablePrimitiveFusing", disable_primitive_fusing);
get_gpu_debug_env_var("DisableFakeAlignment", disable_fake_alignment);
get_gpu_debug_env_var("UseUsmHost", use_usm_host);
get_gpu_debug_env_var("KVCacheCompression", use_kv_cache_compression);
get_gpu_debug_env_var("DynamicQuantizeGroupSize", dynamic_quantize_group_size);
get_gpu_debug_env_var("DisableHorizontalFCFusion", disable_horizontal_fc_fusion);