openvinotoolkit · e-ddykim · Jan 15, 2025 · Jan 15, 2025 · Jan 16, 2025 · Jan 16, 2025
@@ -300,6 +300,80 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion(bool fuse_mlp_swi
             }
             org_fc->clear_control_dependencies();
         }
+
+        // Merge scalar multiply layers into one when all scalar constants have the same value.
+        //
+        //          FusedFC                     FusedFC
+        //             |                           |
+        //       VariadicSplit      ==>         new_Mul  (to be fused with FusedFC)
+        //      /      |      \                    |
+        //    Mul     Mul     Mul            VariadicSplit
+        //     |       |       |             |     |     |
+        const auto is_scalar_const = [](const ov::Output<ov::Node>& output) -> bool {
+            // Accept only Constant producers whose static shape holds exactly one element.
+            const bool from_constant = ov::is_type<ov::op::v0::Constant>(output.get_node());
+            if (!from_constant)
+                return false;
+            const auto pshape = output.get_partial_shape();
+            return !pshape.is_dynamic() && ov::shape_size(pshape.to_shape()) == 1;
+        };
+
+        auto get_const_value = [](const std::shared_ptr<ov::op::v0::Constant>& const_layer) -> float {
+            // First element of const_layer as float; -1.f for precisions other than f16/f32.
+            // f16 is widened exactly via its float conversion (a string round trip can drop low digits).
+            if (const_layer->get_element_type() == ov::element::f16)
+                return static_cast<float>(*const_layer->get_data_ptr<ov::float16>());
+            if (const_layer->get_element_type() == ov::element::f32)
+                return *const_layer->get_data_ptr<float>();
+            return -1.f;
+        };
+
+        float const_value = -1.f;  // negative == "no common scalar" (initial state and failure marker)
+        std::shared_ptr<ov::op::v0::Constant> const_node = nullptr;  // last scalar constant seen
+        for (auto& output : output_split->outputs()) {
+            const auto target_inputs = output.get_target_inputs();
+            // Demand exactly one consumer BEFORE touching begin(): on an unconsumed output begin() == end().
+            auto target_node = target_inputs.size() == 1 ? target_inputs.begin()->get_node() : nullptr;
+            if (target_node == nullptr || !ov::is_type<ov::op::v1::Multiply>(target_node)) {
+                const_value = -1.f;
+                break;
+            }
+            for (auto& input : target_node->inputs()) {
+                if (input.get_source_output() != output) {  // the operand that is not the split output
+                    if (is_scalar_const(input.get_source_output())) {
+                        const_node = std::dynamic_pointer_cast<ov::op::v0::Constant>(
+                            input.get_source_output().get_node_shared_ptr());
+                        // The first scalar seen becomes the reference; any later mismatch aborts the fusion.
+                        if (const_value < 0.f) {
+                            const_value = get_const_value(const_node);
+                        } else if (const_value != get_const_value(const_node)) {
+                            const_value = -1.f;
+                            break;
+                        }
+                    } else {
+                        const_value = -1.f;
+                        break;
+                    }
+                }
+            }
+
+            if (const_value < 0.f)  // failure marker; a negative scalar value also ends the scan here
+                break;
+        }
+
+        if (const_value > 0.f) {  // every split branch multiplies by the same positive scalar
+            auto new_mul = std::make_shared<ov::op::v1::Multiply>(new_fc, const_node);  // single Multiply right after the fused FC
+            new_mul->set_friendly_name(new_fc->get_friendly_name() + "_mul");
+            ov::NodeVector fused_mul_nodes;  // per-branch Multiply nodes that get bypassed below
+            output_split->input(0).replace_source_output(new_mul);  // the split now reads from new_mul
+            for (auto& output : output_split->outputs()) {
+                auto target_node = output.get_target_inputs().begin()->get_node();  // the branch Multiply found by the scan above
+                fused_mul_nodes.push_back(target_node->shared_from_this());
+                ov::replace_output_update_name(target_node->output(0), output);  // hand the Multiply's output role to the split output
+            }
+            ov::copy_runtime_info(fused_mul_nodes, new_mul);  // runtime info of the bypassed Multiplies goes to new_mul
+        }
+
         GPU_DEBUG_TRACE_DETAIL << "Created a new fused FC " << new_fc_name << std::endl;
         return true;
     };

@@ -61,7 +61,7 @@ void ExecutionConfig::set_default() {
         std::make_tuple(ov::hint::kv_cache_precision, ov::element::undefined),
         std::make_tuple(ov::intel_gpu::hint::enable_kernels_reuse, false),
         std::make_tuple(ov::weights_path, ""),
-        std::make_tuple(ov::hint::activations_scale_factor, -1.f),
+        std::make_tuple(ov::hint::activations_scale_factor, 8.f),
 
         // Legacy API properties
         std::make_tuple(ov::intel_gpu::nv12_two_inputs, false),

@@ -18,6 +18,7 @@
 #include "openvino/op/variadic_split.hpp"
 #include "openvino/op/reshape.hpp"
 #include "openvino/op/add.hpp"
+#include "openvino/op/multiply.hpp"
 #include "openvino/pass/manager.hpp"
 
 #include <transformations/utils/utils.hpp>
@@ -242,6 +243,98 @@ TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_eltwise_bias_zp) {
         comparator.enable(FunctionsComparator::ATTRIBUTES);
     }
 }
+
+TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_eltwise_bias_zp_scaling) {  // three FC+Add+scalar-Mul branches -> one fused FC with one Mul
+    std::vector<int64_t> pattern = {7, -1};  // target shape for the trailing Reshape nodes in both models
+    {  // input model: three compressed FCs on the same input, each followed by Add, Multiply and Reshape
+        auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 4096});
+        auto weight1 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{1024, 4096});
+        weight1->set_friendly_name("weight1_1");
+        auto weight2 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{512, 4096});
+        weight2->set_friendly_name("weight1_2");
+        auto weight3 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{128, 4096});
+        weight3->set_friendly_name("weight1_3");
+
+        auto bias1 = std::make_shared<ov::intel_gpu::op::Placeholder>();  // Placeholder in the FC bias slot (presumably "no bias" - confirm)
+        auto bias2 = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto bias3 = std::make_shared<ov::intel_gpu::op::Placeholder>();
+
+        auto scale1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1024, 32});
+        auto scale2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{512, 32});
+        auto scale3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{128, 32});
+        auto fc1 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight1, bias1, scale1);
+        fc1->set_friendly_name("fc1");
+        auto fc2 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight2, bias2, scale2);
+        auto fc3 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight3, bias3, scale3);
+
+        auto add_input1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 1024});
+        auto add1 = std::make_shared<ov::op::v1::Add>(fc1, add_input1);
+
+        auto add_input2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 512});
+        auto add2 = std::make_shared<ov::op::v1::Add>(fc2, add_input2);
+
+        auto add_input3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 128});
+        auto add3 = std::make_shared<ov::op::v1::Add>(fc3, add_input3);
+
+        const std::vector<float> scale_factor = {8.f};  // the same scalar is used for all three Multiply nodes
+        auto mul_input1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, scale_factor);
+        auto mul1 = std::make_shared<ov::op::v1::Multiply>(add1, mul_input1);
+
+        auto mul_input2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, scale_factor);
+        auto mul2 = std::make_shared<ov::op::v1::Multiply>(add2, mul_input2);
+
+        auto mul_input3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, scale_factor);
+        auto mul3 = std::make_shared<ov::op::v1::Multiply>(add3, mul_input3);
+
+        auto reshape_pattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, pattern);
+        auto reshape1 = std::make_shared<ov::op::v1::Reshape>(mul1, reshape_pattern, true);
+        auto reshape2 = std::make_shared<ov::op::v1::Reshape>(mul2, reshape_pattern, true);
+        auto reshape3 = std::make_shared<ov::op::v1::Reshape>(mul3, reshape_pattern, true);
+        auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
+        auto result2 = std::make_shared<ov::op::v0::Result>(reshape2);
+        auto result3 = std::make_shared<ov::op::v0::Result>(reshape3);
+        model = std::make_shared<ov::Model>(ov::ResultVector{result1, result2, result3}, ov::ParameterVector{input});
+        manager.register_pass<FullyConnectedHorizontalFusion>();  // the pass under test
+    }
+    {  // reference model: the graph the pass is expected to produce
+        auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 4096});
+        auto weight1 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{1024, 4096});
+        weight1->set_friendly_name("weight2_1");
+        auto weight2 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{512, 4096});
+        weight2->set_friendly_name("weight2_2");
+        auto weight3 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{128, 4096});
+        weight3->set_friendly_name("weight2_3");
+        auto weights = ov::OutputVector{weight1, weight2, weight3};
+        auto weight_fused = std::make_shared<ov::op::v0::Concat>(weights, 0);  // weights concatenated along axis 0
+        auto bias1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 1024});
+        auto bias2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 512});
+        auto bias3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 128});
+        auto biases = ov::OutputVector{bias1, bias2, bias3};
+        auto bias_fused = std::make_shared<ov::op::v0::Concat>(biases, 1);  // same shapes as the Add constants above, concatenated along axis 1
+        auto scale1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1024, 32});
+        auto scale2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{512, 32});
+        auto scale3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{128, 32});
+        auto scales = ov::OutputVector{scale1, scale2, scale3};
+        auto scale_fused = std::make_shared<ov::op::v0::Concat>(scales, 0);
+        auto fc_fused = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight_fused, bias_fused, scale_fused);
+        const std::vector<float> scale_factor = {8.f};
+        auto mul_input = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, scale_factor);
+        auto mul = std::make_shared<ov::op::v1::Multiply>(fc_fused, mul_input);  // one Multiply between the fused FC and the split
+        auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {fc_fused->get_output_partial_shape(0).size() - 1});  // split along the last axis
+        std::vector<int64_t> orig_n_sizes = {1024, 512, 128};  // first weight dimension of each of the three FCs
+        auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, orig_n_sizes);
+        auto split = std::make_shared<ov::op::v1::VariadicSplit>(mul, axis_const, split_const);
+        auto reshape_pattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, pattern);
+        auto reshape1 = std::make_shared<ov::op::v1::Reshape>(split->output(0), reshape_pattern, true);
+        auto reshape2 = std::make_shared<ov::op::v1::Reshape>(split->output(1), reshape_pattern, true);
+        auto reshape3 = std::make_shared<ov::op::v1::Reshape>(split->output(2), reshape_pattern, true);
+        auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
+        auto result2 = std::make_shared<ov::op::v0::Result>(reshape2);
+        auto result3 = std::make_shared<ov::op::v0::Result>(reshape3);
+        model_ref = std::make_shared<ov::Model>(ov::ResultVector{result1, result2, result3}, ov::ParameterVector{input});
+        comparator.enable(FunctionsComparator::ATTRIBUTES);  // node attributes are part of the comparison
+    }
+}
 }  // namespace intel_gpu
 }  // namespace test
 }  // namespace ov