From dc51fb68495a0a9d3512a5d5c3320dba6b85bcd6 Mon Sep 17 00:00:00 2001
From: "Kim, Eddy"
Date: Thu, 16 Jan 2025 03:36:55 +0900
Subject: [PATCH 1/4] merging multiple scalar multiply layers into one

---
 .../transformations/fc_horizontal_fusion.cpp  | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp
index 48e4540384de27..64025480bb4444 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp
@@ -300,6 +300,70 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion(bool fuse_mlp_swi
             }
             org_fc->clear_control_dependencies();
         }
+
+        // Merge scalar multiply layers into one when all scalar constants have the same value.
+        const auto is_scalar_const = [](const ov::Output<ov::Node>& output) -> bool {
+            if (!ov::is_type<ov::op::v0::Constant>(output.get_node()))
+                return false;
+            const auto shape = output.get_partial_shape();
+            if (shape.is_dynamic())
+                return false;
+            return ov::shape_size(shape.to_shape()) == 1;
+        };
+
+        auto get_const_value = [](const std::shared_ptr<ov::op::v0::Constant>& const_layer) -> float {
+            float const_value = -1.f;
+            if (const_layer->get_element_type() == ov::element::f16) {
+                const_value = std::stof(const_layer->get_data_ptr<ov::float16>()->to_string());
+            } else if (const_layer->get_element_type() == ov::element::f32) {
+                const_value = *const_layer->get_data_ptr<float>();
+            }
+            return const_value;
+        };
+
+        float const_value = -1.f;
+        std::shared_ptr<ov::op::v0::Constant> const_node = nullptr;
+        for (auto& output : output_split->outputs()) {
+            auto& target_input = *output.get_target_inputs().begin();
+            auto target_node = output.get_target_inputs().begin()->get_node();
+            if (output.get_target_inputs().size() > 1 ||
+                !ov::is_type<ov::op::v1::Multiply>(target_node)) {
+                const_value = -1.f;
+                break;
+            }
+
+            for (auto& input : target_node->inputs()) {
+                if (target_input != input) {
+                    if (is_scalar_const(input.get_source_output())) {
+                        const_node = std::dynamic_pointer_cast<ov::op::v0::Constant>(
+                            input.get_source_output().get_node_shared_ptr());
+
+                        if (const_value < 0.f) {
+                            const_value = get_const_value(const_node);
+                        } else if (const_value != get_const_value(const_node)) {
+                            const_value = -1.f;
+                            break;
+                        }
+                    } else {
+                        const_value = -1.f;
+                        break;
+                    }
+                }
+            }
+
+            if (const_value < 0.f)
+                break;
+        }
+
+        if (const_value > 0.f) {
+            auto new_mul = std::make_shared<ov::op::v1::Multiply>(new_fc, const_node);
+            output_split->input(0).replace_source_output(new_mul);
+            for (auto& output : output_split->outputs()) {
+                auto target_node = output.get_target_inputs().begin()->get_node();
+                ov::replace_output_update_name(target_node->output(0), output);
+            }
+        }
+
         GPU_DEBUG_TRACE_DETAIL << "Created a new fused FC " << new_fc_name << std::endl;
         return true;
     };

From 66dbbf35f2d4ab4ab1fe98358c63643fd2c5328d Mon Sep 17 00:00:00 2001
From: "Kim, Eddy"
Date: Thu, 16 Jan 2025 07:03:14 +0900
Subject: [PATCH 2/4] not to use *begin()

---
 .../src/plugin/transformations/fc_horizontal_fusion.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp
index 64025480bb4444..2fb5acba53abb0 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp
@@ -324,7 +324,6 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion(bool fuse_mlp_swi
         float const_value = -1.f;
         std::shared_ptr<ov::op::v0::Constant> const_node = nullptr;
         for (auto& output : output_split->outputs()) {
-            auto& target_input = *output.get_target_inputs().begin();
             auto target_node = output.get_target_inputs().begin()->get_node();
             if (output.get_target_inputs().size() > 1 ||
                 !ov::is_type<ov::op::v1::Multiply>(target_node)) {
@@ -333,7 +332,7 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion(bool fuse_mlp_swi
             }
 
             for (auto& input : target_node->inputs()) {
-                if (target_input != input) {
+                if (input.get_source_output() != output) {
                     if (is_scalar_const(input.get_source_output())) {
                         const_node = std::dynamic_pointer_cast<ov::op::v0::Constant>(
                             input.get_source_output().get_node_shared_ptr());

From 1b5add523fb6542523de7dc5ecbe89a9b1f3da2c Mon Sep 17 00:00:00 2001
From: "Kim, Eddy"
Date: Thu, 16 Jan 2025 21:56:51 +0900
Subject: [PATCH 3/4] added a unit test

---
 .../transformations/fc_horizontal_fusion.cpp  | 11 +++
 .../horizontal_fc_fusion_test.cpp             | 93 +++++++++++++++++++
 2 files changed, 104 insertions(+)

diff --git a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp
index 2fb5acba53abb0..f9d75a6dfc0b4c 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp
@@ -302,6 +302,13 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion(bool fuse_mlp_swi
         }
 
         // Merge scalar multiply layers into one when all scalar constants have the same value.
+        //
+        //     FusedFC                    FusedFC
+        //        |                          |
+        //  VariadicSplit      ==>        new_Mul (to be fused with FusedFC)
+        //    /   |   \                      |
+        //  Mul  Mul  Mul              VariadicSplit
+        //   |    |    |                 |   |   |
         const auto is_scalar_const = [](const ov::Output<ov::Node>& output) -> bool {
             if (!ov::is_type<ov::op::v0::Constant>(output.get_node()))
                 return false;
@@ -356,11 +363,15 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion(bool fuse_mlp_swi
 
         if (const_value > 0.f) {
             auto new_mul = std::make_shared<ov::op::v1::Multiply>(new_fc, const_node);
+            new_mul->set_friendly_name(new_fc->get_friendly_name() + "_mul");
+            ov::NodeVector fused_mul_nodes;
             output_split->input(0).replace_source_output(new_mul);
             for (auto& output : output_split->outputs()) {
                 auto target_node = output.get_target_inputs().begin()->get_node();
+                fused_mul_nodes.push_back(target_node->shared_from_this());
                 ov::replace_output_update_name(target_node->output(0), output);
             }
+            ov::copy_runtime_info(fused_mul_nodes, new_mul);
         }
 
         GPU_DEBUG_TRACE_DETAIL << "Created a new fused FC " << new_fc_name << std::endl;
diff --git a/src/plugins/intel_gpu/tests/unit/transformations/horizontal_fc_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/horizontal_fc_fusion_test.cpp
index af7e6482002ae2..15f225ba6aa2ea 100644
--- a/src/plugins/intel_gpu/tests/unit/transformations/horizontal_fc_fusion_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/transformations/horizontal_fc_fusion_test.cpp
@@ -18,6 +18,7 @@
 #include "openvino/op/variadic_split.hpp"
 #include "openvino/op/reshape.hpp"
 #include "openvino/op/add.hpp"
+#include "openvino/op/multiply.hpp"
 #include "openvino/pass/manager.hpp"
 
 #include
@@ -242,6 +243,98 @@ TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_eltwise_bias_zp) {
         comparator.enable(FunctionsComparator::ATTRIBUTES);
     }
 }
+
+TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_eltwise_bias_zp_scaling) {
+    std::vector<int64_t> pattern = {7, -1};
+    {
+        auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 4096});
+        auto weight1 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{1024, 4096});
+        weight1->set_friendly_name("weight1_1");
+        auto weight2 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{512, 4096});
+        weight2->set_friendly_name("weight1_2");
+        auto weight3 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{128, 4096});
+        weight3->set_friendly_name("weight1_3");
+
+        auto bias1 = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto bias2 = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto bias3 = std::make_shared<ov::intel_gpu::op::Placeholder>();
+
+        auto scale1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1024, 32});
+        auto scale2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{512, 32});
+        auto scale3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{128, 32});
+        auto fc1 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight1, bias1, scale1);
+        fc1->set_friendly_name("fc1");
+        auto fc2 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight2, bias2, scale2);
+        auto fc3 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight3, bias3, scale3);
+
+        auto add_input1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 1024});
+        auto add1 = std::make_shared<ov::op::v1::Add>(fc1, add_input1);
+
+        auto add_input2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 512});
+        auto add2 = std::make_shared<ov::op::v1::Add>(fc2, add_input2);
+
+        auto add_input3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 128});
+        auto add3 = std::make_shared<ov::op::v1::Add>(fc3, add_input3);
+
+        const std::vector<float> scale_factor = {8.f};
+        auto mul_input1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, scale_factor);
+        auto mul1 = std::make_shared<ov::op::v1::Multiply>(add1, mul_input1);
+
+        auto mul_input2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, scale_factor);
+        auto mul2 = std::make_shared<ov::op::v1::Multiply>(add2, mul_input2);
+
+        auto mul_input3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, scale_factor);
+        auto mul3 = std::make_shared<ov::op::v1::Multiply>(add3, mul_input3);
+
+        auto reshape_pattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, pattern);
+        auto reshape1 = std::make_shared<ov::op::v1::Reshape>(mul1, reshape_pattern, true);
+        auto reshape2 = std::make_shared<ov::op::v1::Reshape>(mul2, reshape_pattern, true);
+        auto reshape3 = std::make_shared<ov::op::v1::Reshape>(mul3, reshape_pattern, true);
+        auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
+        auto result2 = std::make_shared<ov::op::v0::Result>(reshape2);
+        auto result3 = std::make_shared<ov::op::v0::Result>(reshape3);
+        model = std::make_shared<ov::Model>(ov::ResultVector{result1, result2, result3}, ov::ParameterVector{input});
+        manager.register_pass<FullyConnectedHorizontalFusion>();
+    }
+    {
+        auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 4096});
+        auto weight1 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{1024, 4096});
+        weight1->set_friendly_name("weight2_1");
+        auto weight2 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{512, 4096});
+        weight2->set_friendly_name("weight2_2");
+        auto weight3 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{128, 4096});
+        weight3->set_friendly_name("weight2_3");
+        auto weights = ov::OutputVector{weight1, weight2, weight3};
+        auto weight_fused = std::make_shared<ov::op::v0::Concat>(weights, 0);
+        auto bias1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 1024});
+        auto bias2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 512});
+        auto bias3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 128});
+        auto biases = ov::OutputVector{bias1, bias2, bias3};
+        auto bias_fused = std::make_shared<ov::op::v0::Concat>(biases, 1);
+        auto scale1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1024, 32});
+        auto scale2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{512, 32});
+        auto scale3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{128, 32});
+        auto scales = ov::OutputVector{scale1, scale2, scale3};
+        auto scale_fused = std::make_shared<ov::op::v0::Concat>(scales, 0);
+        auto fc_fused = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight_fused, bias_fused, scale_fused);
+        const std::vector<float> scale_factor = {8.f};
+        auto mul_input = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, scale_factor);
+        auto mul = std::make_shared<ov::op::v1::Multiply>(fc_fused, mul_input);
+        auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {fc_fused->get_output_partial_shape(0).size() - 1});
+        std::vector<int64_t> orig_n_sizes = {1024, 512, 128};
+        auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, orig_n_sizes);
+        auto split = std::make_shared<ov::op::v1::VariadicSplit>(mul, axis_const, split_const);
+        auto reshape_pattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, pattern);
+        auto reshape1 = std::make_shared<ov::op::v1::Reshape>(split->output(0), reshape_pattern, true);
+        auto reshape2 = std::make_shared<ov::op::v1::Reshape>(split->output(1), reshape_pattern, true);
+        auto reshape3 = std::make_shared<ov::op::v1::Reshape>(split->output(2), reshape_pattern, true);
+        auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
+        auto result2 = std::make_shared<ov::op::v0::Result>(reshape2);
+        auto result3 = std::make_shared<ov::op::v0::Result>(reshape3);
+        model_ref = std::make_shared<ov::Model>(ov::ResultVector{result1, result2, result3}, ov::ParameterVector{input});
+        comparator.enable(FunctionsComparator::ATTRIBUTES);
+    }
+}
 } // namespace intel_gpu
 } // namespace test
 } // namespace ov
\ No newline at end of file

From 234ba1b3349d35b7566f798b603deef182cd1117 Mon Sep 17 00:00:00 2001
From: "Kim, Eddy"
Date: Thu, 16 Jan 2025 22:21:36 +0900
Subject: [PATCH 4/4] default scale factor = 8

---
 src/plugins/intel_gpu/src/runtime/execution_config.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp
index 16c47b7116853b..44e5c1b8fe0018 100644
--- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp
+++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp
@@ -61,7 +61,7 @@ void ExecutionConfig::set_default() {
         std::make_tuple(ov::hint::kv_cache_precision, ov::element::undefined),
         std::make_tuple(ov::intel_gpu::hint::enable_kernels_reuse, false),
         std::make_tuple(ov::weights_path, ""),
-        std::make_tuple(ov::hint::activations_scale_factor, -1.f),
+        std::make_tuple(ov::hint::activations_scale_factor, 8.f),
 
         // Legacy API properties
         std::make_tuple(ov::intel_gpu::nv12_two_inputs, false),