Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DO NOT REVIEW] scaling test #28388

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,80 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion(bool fuse_mlp_swi
}
org_fc->clear_control_dependencies();
}

// Merge scalar multiply layers into one when all scalar constants have the same value.
//
//        FusedFC                        FusedFC
//           |                              |
//     VariadicSplit        ==>          new_Mul (to be fused with FusedFC)
//      /    |    \                         |
//    Mul   Mul   Mul                 VariadicSplit
//     |     |     |                   |    |    |
// Predicate: true only when `output` is produced by a Constant node whose shape is
// static and contains exactly one element.
const auto is_scalar_const = [](const ov::Output<ov::Node>& output) -> bool {
    if (ov::is_type<ov::op::v0::Constant>(output.get_node())) {
        const auto pshape = output.get_partial_shape();
        // A dynamic shape cannot be converted to a concrete element count.
        return !pshape.is_dynamic() && ov::shape_size(pshape.to_shape()) == 1;
    }
    return false;
};

// Reads the first element of `const_layer` as a float.
// Supports f16 and f32 constants; any other element type yields -1.f, which the
// code using this helper treats as "not mergeable".
auto get_const_value = [](const std::shared_ptr<ov::op::v0::Constant>& const_layer) -> float {
    const auto& elem_type = const_layer->get_element_type();
    if (elem_type == ov::element::f16) {
        // Convert the half value directly instead of round-tripping through a decimal
        // string (to_string() + std::stof), which allocates and may round the value.
        return static_cast<float>(*const_layer->get_data_ptr<ov::float16>());
    }
    if (elem_type == ov::element::f32) {
        return *const_layer->get_data_ptr<float>();
    }
    return -1.f;
};

// Check whether every VariadicSplit output is consumed by exactly one Multiply whose other
// operand is a scalar Constant, and whether all of those constants carry the same value.
// On a match `const_value` holds the shared factor and `const_node` the last matching
// Constant; on any mismatch `const_value` is reset to -1.f.
// NOTE(review): -1.f doubles as the "no match" marker, so negative factors are never accepted.
float const_value = -1.f;
std::shared_ptr<ov::op::v0::Constant> const_node = nullptr;
for (auto& output : output_split->outputs()) {
    const auto target_inputs = output.get_target_inputs();
    // Exactly one consumer is required. The count is checked before begin() is
    // dereferenced so that an output without consumers cannot trigger undefined behaviour.
    if (target_inputs.size() != 1) {
        const_value = -1.f;
        break;
    }

    auto target_node = target_inputs.begin()->get_node();
    if (!ov::is_type<ov::op::v1::Multiply>(target_node)) {
        const_value = -1.f;
        break;
    }

    // Inspect the Multiply operand(s) that do not come from the split output itself.
    for (auto& input : target_node->inputs()) {
        if (input.get_source_output() == output)
            continue;

        if (!is_scalar_const(input.get_source_output())) {
            const_value = -1.f;
            break;
        }

        const_node = std::dynamic_pointer_cast<ov::op::v0::Constant>(
            input.get_source_output().get_node_shared_ptr());

        if (const_value < 0.f) {
            // First constant seen: remember its value as the reference.
            const_value = get_const_value(const_node);
        } else if (const_value != get_const_value(const_node)) {
            const_value = -1.f;
            break;
        }
    }

    // The inner loop can only signal failure through const_value; propagate it here.
    if (const_value < 0.f)
        break;
}

// All branches multiply by the same positive scalar: hoist a single Multiply above the
// VariadicSplit and bypass the per-branch Multiply nodes.
if (const_value > 0.f) {
    auto hoisted_mul = std::make_shared<ov::op::v1::Multiply>(new_fc, const_node);
    hoisted_mul->set_friendly_name(new_fc->get_friendly_name() + "_mul");

    // Feed the split from the hoisted Multiply before rewiring the branches.
    output_split->input(0).replace_source_output(hoisted_mul);

    ov::NodeVector bypassed_muls;
    for (auto& split_out : output_split->outputs()) {
        auto branch_mul = split_out.get_target_inputs().begin()->get_node();
        bypassed_muls.push_back(branch_mul->shared_from_this());
        // Consumers of the branch Multiply now read the split output directly.
        ov::replace_output_update_name(branch_mul->output(0), split_out);
    }

    ov::copy_runtime_info(bypassed_muls, hoisted_mul);
}

GPU_DEBUG_TRACE_DETAIL << "Created a new fused FC " << new_fc_name << std::endl;
return true;
};
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/runtime/execution_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ void ExecutionConfig::set_default() {
std::make_tuple(ov::hint::kv_cache_precision, ov::element::undefined),
std::make_tuple(ov::intel_gpu::hint::enable_kernels_reuse, false),
std::make_tuple(ov::weights_path, ""),
std::make_tuple(ov::hint::activations_scale_factor, -1.f),
std::make_tuple(ov::hint::activations_scale_factor, 8.f),

// Legacy API properties
std::make_tuple(ov::intel_gpu::nv12_two_inputs, false),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "openvino/op/variadic_split.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/pass/manager.hpp"

#include <transformations/utils/utils.hpp>
Expand Down Expand Up @@ -242,6 +243,98 @@ TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_eltwise_bias_zp) {
comparator.enable(FunctionsComparator::ATTRIBUTES);
}
}

// Three compressed FCs share one input; each is followed by Add(bias) and Multiply by the
// same scalar f16 constant (8.0). The reference model expects one fused FC with concatenated
// weights/biases/scales, a single Multiply after it, and a VariadicSplit feeding the reshapes.
TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_eltwise_bias_zp_scaling) {
    using Constant = ov::op::v0::Constant;
    using Parameter = ov::op::v0::Parameter;
    using Result = ov::op::v0::Result;
    using Concat = ov::op::v0::Concat;
    using Add = ov::op::v1::Add;
    using Multiply = ov::op::v1::Multiply;
    using Reshape = ov::op::v1::Reshape;
    using VariadicSplit = ov::op::v1::VariadicSplit;
    using Placeholder = ov::intel_gpu::op::Placeholder;
    using FCCompressed = ov::intel_gpu::op::FullyConnectedCompressed;

    // u4 weight constant of the given shape (no explicit data) with an explicit friendly name.
    const auto u4_weight = [](const ov::Shape& shape, const char* name) {
        auto weight = std::make_shared<Constant>(ov::element::u4, shape);
        weight->set_friendly_name(name);
        return weight;
    };
    // f16 constant of the given shape, created without explicit data.
    const auto f16_const = [](const ov::Shape& shape) {
        return std::make_shared<Constant>(ov::element::f16, shape);
    };
    // Scalar (rank-0) f16 constant holding the given values.
    const auto f16_scalar = [](const std::vector<float>& values) {
        return std::make_shared<Constant>(ov::element::f16, ov::Shape{}, values);
    };

    std::vector<int64_t> pattern = {7, -1};

    // Model under test: three independent FC -> Add -> Multiply -> Reshape branches.
    {
        auto data = std::make_shared<Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 4096});

        auto w_a = u4_weight(ov::Shape{1024, 4096}, "weight1_1");
        auto w_b = u4_weight(ov::Shape{512, 4096}, "weight1_2");
        auto w_c = u4_weight(ov::Shape{128, 4096}, "weight1_3");

        auto no_bias_a = std::make_shared<Placeholder>();
        auto no_bias_b = std::make_shared<Placeholder>();
        auto no_bias_c = std::make_shared<Placeholder>();

        auto scale_a = f16_const(ov::Shape{1024, 32});
        auto scale_b = f16_const(ov::Shape{512, 32});
        auto scale_c = f16_const(ov::Shape{128, 32});

        auto fc_a = std::make_shared<FCCompressed>(data, w_a, no_bias_a, scale_a);
        fc_a->set_friendly_name("fc1");
        auto fc_b = std::make_shared<FCCompressed>(data, w_b, no_bias_b, scale_b);
        auto fc_c = std::make_shared<FCCompressed>(data, w_c, no_bias_c, scale_c);

        auto addend_a = f16_const(ov::Shape{1, 1024});
        auto biased_a = std::make_shared<Add>(fc_a, addend_a);

        auto addend_b = f16_const(ov::Shape{1, 512});
        auto biased_b = std::make_shared<Add>(fc_b, addend_b);

        auto addend_c = f16_const(ov::Shape{1, 128});
        auto biased_c = std::make_shared<Add>(fc_c, addend_c);

        const std::vector<float> scale_factor = {8.f};
        auto factor_a = f16_scalar(scale_factor);
        auto scaled_a = std::make_shared<Multiply>(biased_a, factor_a);

        auto factor_b = f16_scalar(scale_factor);
        auto scaled_b = std::make_shared<Multiply>(biased_b, factor_b);

        auto factor_c = f16_scalar(scale_factor);
        auto scaled_c = std::make_shared<Multiply>(biased_c, factor_c);

        auto target_shape = std::make_shared<Constant>(ov::element::i64, ov::Shape{2}, pattern);
        auto flat_a = std::make_shared<Reshape>(scaled_a, target_shape, true);
        auto flat_b = std::make_shared<Reshape>(scaled_b, target_shape, true);
        auto flat_c = std::make_shared<Reshape>(scaled_c, target_shape, true);

        auto out_a = std::make_shared<Result>(flat_a);
        auto out_b = std::make_shared<Result>(flat_b);
        auto out_c = std::make_shared<Result>(flat_c);

        model = std::make_shared<ov::Model>(ov::ResultVector{out_a, out_b, out_c}, ov::ParameterVector{data});
        manager.register_pass<FullyConnectedHorizontalFusion>();
    }

    // Reference model: one fused FC -> single Multiply -> VariadicSplit -> Reshape per branch.
    {
        auto data = std::make_shared<Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 4096});

        auto w_a = u4_weight(ov::Shape{1024, 4096}, "weight2_1");
        auto w_b = u4_weight(ov::Shape{512, 4096}, "weight2_2");
        auto w_c = u4_weight(ov::Shape{128, 4096}, "weight2_3");
        auto all_weights = ov::OutputVector{w_a, w_b, w_c};
        auto fused_weight = std::make_shared<Concat>(all_weights, 0);

        auto bias_a = f16_const(ov::Shape{1, 1024});
        auto bias_b = f16_const(ov::Shape{1, 512});
        auto bias_c = f16_const(ov::Shape{1, 128});
        auto all_biases = ov::OutputVector{bias_a, bias_b, bias_c};
        auto fused_bias = std::make_shared<Concat>(all_biases, 1);

        auto scale_a = f16_const(ov::Shape{1024, 32});
        auto scale_b = f16_const(ov::Shape{512, 32});
        auto scale_c = f16_const(ov::Shape{128, 32});
        auto all_scales = ov::OutputVector{scale_a, scale_b, scale_c};
        auto fused_scale = std::make_shared<Concat>(all_scales, 0);

        auto fused_fc = std::make_shared<FCCompressed>(data, fused_weight, fused_bias, fused_scale);

        const std::vector<float> scale_factor = {8.f};
        auto factor = f16_scalar(scale_factor);
        auto scaled_fc = std::make_shared<Multiply>(fused_fc, factor);

        // Split along the last axis of the fused FC output, using the original per-FC output sizes.
        auto split_axis = Constant::create(ov::element::i64, ov::Shape{1}, {fused_fc->get_output_partial_shape(0).size() - 1});
        std::vector<int64_t> orig_n_sizes = {1024, 512, 128};
        auto split_lengths = Constant::create(ov::element::i64, ov::Shape{3}, orig_n_sizes);
        auto branches = std::make_shared<VariadicSplit>(scaled_fc, split_axis, split_lengths);

        auto target_shape = std::make_shared<Constant>(ov::element::i64, ov::Shape{2}, pattern);
        auto flat_a = std::make_shared<Reshape>(branches->output(0), target_shape, true);
        auto flat_b = std::make_shared<Reshape>(branches->output(1), target_shape, true);
        auto flat_c = std::make_shared<Reshape>(branches->output(2), target_shape, true);

        auto out_a = std::make_shared<Result>(flat_a);
        auto out_b = std::make_shared<Result>(flat_b);
        auto out_c = std::make_shared<Result>(flat_c);

        model_ref = std::make_shared<ov::Model>(ov::ResultVector{out_a, out_b, out_c}, ov::ParameterVector{data});
        comparator.enable(FunctionsComparator::ATTRIBUTES);
    }
}
} // namespace intel_gpu
} // namespace test
} // namespace ov
Loading